From eafb0dc37d58c27f20f687913bfefc0b0d406897 Mon Sep 17 00:00:00 2001 From: GiackAloZ Date: Tue, 8 Oct 2024 09:10:11 +0200 Subject: [PATCH 01/46] Add MetalExt and weak dependency --- Project.toml | 2 ++ ext/ParallelStencil_MetalExt.jl | 4 ++++ 2 files changed, 6 insertions(+) create mode 100644 ext/ParallelStencil_MetalExt.jl diff --git a/Project.toml b/Project.toml index 2fb00947..38bfef7f 100644 --- a/Project.toml +++ b/Project.toml @@ -13,12 +13,14 @@ StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" +Metal = "dde4c033-4e86-420c-a63e-0dd931031962" Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588" [extensions] ParallelStencil_AMDGPUExt = "AMDGPU" ParallelStencil_CUDAExt = "CUDA" ParallelStencil_EnzymeExt = "Enzyme" +ParallelStencil_MetalExt = "Metal" [compat] AMDGPU = "0.6, 0.7, 0.8, 0.9, 1" diff --git a/ext/ParallelStencil_MetalExt.jl b/ext/ParallelStencil_MetalExt.jl new file mode 100644 index 00000000..1c76be72 --- /dev/null +++ b/ext/ParallelStencil_MetalExt.jl @@ -0,0 +1,4 @@ +module ParallelStencil_MetalExt + # include(joinpath(@__DIR__, "..", "src", "ParallelKernel", "MetalExt", "shared.jl")) + # include(joinpath(@__DIR__, "..", "src", "ParallelKernel", "MetalExt", "allocators.jl")) +end \ No newline at end of file From 33e46e885f3ba68fb431646db5616758a7c33899 Mon Sep 17 00:00:00 2001 From: GiackAloZ Date: Tue, 8 Oct 2024 09:13:40 +0200 Subject: [PATCH 02/46] Create entry point files (still empty) --- ext/ParallelStencil_MetalExt.jl | 4 ++-- src/ParallelKernel/MetalExt/allocators.jl | 0 src/ParallelKernel/MetalExt/defaults.jl | 0 src/ParallelKernel/MetalExt/shared.jl | 0 4 files changed, 2 insertions(+), 2 deletions(-) create mode 100644 src/ParallelKernel/MetalExt/allocators.jl create mode 100644 src/ParallelKernel/MetalExt/defaults.jl create mode 100644 src/ParallelKernel/MetalExt/shared.jl diff --git a/ext/ParallelStencil_MetalExt.jl b/ext/ParallelStencil_MetalExt.jl index 1c76be72..254aac1e 100644 --- a/ext/ParallelStencil_MetalExt.jl +++ b/ext/ParallelStencil_MetalExt.jl @@ -1,4 +1,4 @@ module ParallelStencil_MetalExt - # include(joinpath(@__DIR__, "..", "src", "ParallelKernel", "MetalExt", "shared.jl")) - # include(joinpath(@__DIR__, "..", "src", "ParallelKernel", "MetalExt", "allocators.jl")) + include(joinpath(@__DIR__, "..", "src", "ParallelKernel", "MetalExt", "shared.jl")) + include(joinpath(@__DIR__, "..", "src", "ParallelKernel", "MetalExt", "allocators.jl")) end \ No newline at end of file diff --git a/src/ParallelKernel/MetalExt/allocators.jl b/src/ParallelKernel/MetalExt/allocators.jl new file mode 100644 index 00000000..e69de29b diff --git a/src/ParallelKernel/MetalExt/defaults.jl b/src/ParallelKernel/MetalExt/defaults.jl new file mode 100644 index 00000000..e69de29b diff --git a/src/ParallelKernel/MetalExt/shared.jl b/src/ParallelKernel/MetalExt/shared.jl new file mode 100644 index 00000000..e69de29b From 1fdcc74ad879fb77631a799f128c9fbea736a7b7 Mon Sep 17 00:00:00 2001 From: GiackAloZ Date: Tue, 8 Oct 2024 12:19:45 +0200 Subject: [PATCH 03/46] Add defaults --- src/ParallelKernel/MetalExt/defaults.jl | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/ParallelKernel/MetalExt/defaults.jl b/src/ParallelKernel/MetalExt/defaults.jl index e69de29b..16750d52 100644 --- a/src/ParallelKernel/MetalExt/defaults.jl +++ b/src/ParallelKernel/MetalExt/defaults.jl @@ -0,0 +1,18 @@ +const 
ERRMSG_METALEXT_NOT_LOADED = "the Metal extension was not loaded. Make sure to import Metal before ParallelStencil." + +# shared.jl + +function get_priority_mtlstream end +function get_mtlstream end + +# allocators + +zeros_metal(arg...) = @NotLoadedError(ERRMSG_METALEXT_NOT_LOADED) +ones_metal(arg...) = @NotLoadedError(ERRMSG_METALEXT_NOT_LOADED) +rand_metal(arg...) = @NotLoadedError(ERRMSG_METALEXT_NOT_LOADED) +falses_metal(arg...) = @NotLoadedError(ERRMSG_METALEXT_NOT_LOADED) +trues_metal(arg...) = @NotLoadedError(ERRMSG_METALEXT_NOT_LOADED) +fill_metal(arg...) = @NotLoadedError(ERRMSG_METALEXT_NOT_LOADED) +fill_metal!(arg...) = @NotLoadedError(ERRMSG_METALEXT_NOT_LOADED) + + From d98ab6fe7ccc89a4edb34f4f4ae787cf248634ad Mon Sep 17 00:00:00 2001 From: GiackAloZ Date: Tue, 8 Oct 2024 13:43:22 +0200 Subject: [PATCH 04/46] Add shared functions --- src/ParallelKernel/MetalExt/shared.jl | 32 +++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/src/ParallelKernel/MetalExt/shared.jl b/src/ParallelKernel/MetalExt/shared.jl index e69de29b..52c3801e 100644 --- a/src/ParallelKernel/MetalExt/shared.jl +++ b/src/ParallelKernel/MetalExt/shared.jl @@ -0,0 +1,32 @@ +import ParallelStencil +import ParallelStencil.ParallelKernel: INT_METAL, rand_cpu, fill_cpu, construct_cell, check_datatype, rand_metal, fill_metal +using ParallelStencil.ParallelKernel.Exceptions +using Metal, CellArrays, StaticArrays +import Metal.MTL + +## TODO add Metal backend for CellArray +# @define_MetalCellArray + +## FUNCTIONS TO CHECK EXTENSIONS SUPPORT + +ParallelStencil.ParallelKernel.is_loaded(::Val{:ParallelStencil_MetalExt}) = true + +## FUNCTIONS TO GET CREATE AND MANAGE METAL QUEUES + +ParallelStencil.ParallelKernel.get_priority_metalqueue(arg...) = get_priority_metalqueue(arg...) +ParallelStencil.ParallelKernel.get_metalqueue(arg...) = get_metalqueue(arg...) +let + global get_priority_metalqueue, get_metalqueue + priority_metalqueues = Array{MTLCommandQueue}(undef, 0) + metalqueues = Array{MTLCommandQueue}(undef, 0) + + function get_priority_metalqueue(id::Integer) + while (id > length(priority_metalqueues)) push!(priority_metalqueues, MTL.MTLCommandQueue(device())) end # No priority setting available in Metal queues. 
+ return priority_metalqueues[id] + end + + function get_metalqueue(id::Integer) + while (id > length(metalqueues)) push!(metalqueues, MTLCommandQueue(MTLDevice.default_device())) end + return metalqueues[id] + end +end \ No newline at end of file From 649dad4aff0acccbd54fa7bc40199f6f7419d995 Mon Sep 17 00:00:00 2001 From: GiackAloZ Date: Tue, 8 Oct 2024 13:44:19 +0200 Subject: [PATCH 05/46] Fix shared functions --- src/ParallelKernel/MetalExt/shared.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ParallelKernel/MetalExt/shared.jl b/src/ParallelKernel/MetalExt/shared.jl index 52c3801e..9a60baf4 100644 --- a/src/ParallelKernel/MetalExt/shared.jl +++ b/src/ParallelKernel/MetalExt/shared.jl @@ -26,7 +26,7 @@ let end function get_metalqueue(id::Integer) - while (id > length(metalqueues)) push!(metalqueues, MTLCommandQueue(MTLDevice.default_device())) end + while (id > length(metalqueues)) push!(metalqueues, MTL.MTLCommandQueue(device())) end return metalqueues[id] end end \ No newline at end of file From f633876c636e52192a6ed09c75c91f2fb1a954ed Mon Sep 17 00:00:00 2001 From: GiackAloZ Date: Tue, 8 Oct 2024 14:22:27 +0200 Subject: [PATCH 06/46] Define Metal constants --- src/ParallelKernel/shared.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/ParallelKernel/shared.jl b/src/ParallelKernel/shared.jl index 17d0a817..63a303e7 100644 --- a/src/ParallelKernel/shared.jl +++ b/src/ParallelKernel/shared.jl @@ -12,12 +12,14 @@ gensym_world(tag::Expr, generator::Module) = gensym(string(tag, GENSYM_SEPARAT const PKG_CUDA = :CUDA const PKG_AMDGPU = :AMDGPU +const PKG_METAL = :Metal const PKG_THREADS = :Threads const PKG_POLYESTER = :Polyester const PKG_NONE = :PKG_NONE const SUPPORTED_PACKAGES = [PKG_THREADS, PKG_POLYESTER, PKG_CUDA, PKG_AMDGPU] const INT_CUDA = Int64 # NOTE: unsigned integers are not yet supported (proper negative offset and range is dealing missing) const INT_AMDGPU = Int64 # NOTE: ... +const INT_METAL = Int64 # NOTE: ... const INT_POLYESTER = Int64 # NOTE: ... const INT_THREADS = Int64 # NOTE: ... const NTHREADS_X_MAX = 32 From 91ab97b54b85194d59167807a073ce33ba7f3195 Mon Sep 17 00:00:00 2001 From: GiackAloZ Date: Tue, 8 Oct 2024 14:24:18 +0200 Subject: [PATCH 07/46] Add Metal kernel int type --- src/ParallelKernel/MetalExt/shared.jl | 2 -- src/ParallelKernel/shared.jl | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/ParallelKernel/MetalExt/shared.jl b/src/ParallelKernel/MetalExt/shared.jl index 9a60baf4..8f9587bb 100644 --- a/src/ParallelKernel/MetalExt/shared.jl +++ b/src/ParallelKernel/MetalExt/shared.jl @@ -8,11 +8,9 @@ import Metal.MTL # @define_MetalCellArray ## FUNCTIONS TO CHECK EXTENSIONS SUPPORT - ParallelStencil.ParallelKernel.is_loaded(::Val{:ParallelStencil_MetalExt}) = true ## FUNCTIONS TO GET CREATE AND MANAGE METAL QUEUES - ParallelStencil.ParallelKernel.get_priority_metalqueue(arg...) = get_priority_metalqueue(arg...) ParallelStencil.ParallelKernel.get_metalqueue(arg...) = get_metalqueue(arg...) 
let diff --git a/src/ParallelKernel/shared.jl b/src/ParallelKernel/shared.jl index 63a303e7..d6bf9efb 100644 --- a/src/ParallelKernel/shared.jl +++ b/src/ParallelKernel/shared.jl @@ -75,6 +75,7 @@ macro rangelengths() esc(:(($(RANGELENGTHS_VARNAMES...),))) end function kernel_int_type(package::Symbol) if (package == PKG_CUDA) int_type = INT_CUDA elseif (package == PKG_AMDGPU) int_type = INT_AMDGPU + elseif (package == PKG_METAL) int_type = INT_METAL elseif (package == PKG_THREADS) int_type = INT_THREADS elseif (package == PKG_POLYESTER) int_type = INT_POLYESTER end From 3059741021a3ba89ee3deb38038f04ad07b2b66f Mon Sep 17 00:00:00 2001 From: GiackAloZ Date: Tue, 8 Oct 2024 14:41:53 +0200 Subject: [PATCH 08/46] Add Metal allocators (not everything for CellArrays just yet) --- src/ParallelKernel/MetalExt/allocators.jl | 29 +++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/src/ParallelKernel/MetalExt/allocators.jl b/src/ParallelKernel/MetalExt/allocators.jl index e69de29b..1d6697ef 100644 --- a/src/ParallelKernel/MetalExt/allocators.jl +++ b/src/ParallelKernel/MetalExt/allocators.jl @@ -0,0 +1,29 @@ +## RUNTIME ALLOCATOR FUNCTIONS + +ParallelStencil.ParallelKernel.zeros_metal(::Type{T}, blocklength, args...) where {T<:Number} = (check_datatype_metal(T); Metal.zeros(T, args...)) # (blocklength is ignored if neither celldims nor celltype is set) +ParallelStencil.ParallelKernel.ones_metal(::Type{T}, blocklength, args...) where {T<:Number} = (check_datatype_metal(T); Metal.ones(T, args...)) +ParallelStencil.ParallelKernel.rand_metal(::Type{T}, blocklength, args...) where {T<:Union{Number,Enum}} = (check_datatype_metal(T); MtlArray(rand_cpu(T, blocklength, args...))) +ParallelStencil.ParallelKernel.falses_metal(::Type{T}, blocklength, args...) where {T<:Bool} = Metal.falses(args...) +ParallelStencil.ParallelKernel.trues_metal(::Type{T}, blocklength, args...) where {T<:Bool} = Metal.trues(args...) +ParallelStencil.ParallelKernel.fill_metal(::Type{T}, blocklength, args...) where {T<:Union{Number,Enum}} = MtlArray(fill_cpu(T, blocklength, args...)) + +ParallelStencil.ParallelKernel.zeros_metal(::Type{T}, blocklength, args...) where {T<:Union{SArray,FieldArray}} = (check_datatype_metal(T); fill_metal(T, blocklength, 0, args...)) +ParallelStencil.ParallelKernel.ones_metal(::Type{T}, blocklength, args...) where {T<:Union{SArray,FieldArray}} = (check_datatype_metal(T); fill_metal(T, blocklength, 1, args...)) +ParallelStencil.ParallelKernel.rand_metal(::Type{T}, ::Val{B}, dims) where {T<:Union{SArray,FieldArray}, B} = (check_datatype_metal(T, Bool, Enum); blocklen = (B == 0) ? prod(dims) : B; CellArray{T,length(dims),B, Metal.MtlArray{eltype(T),3}}(Metal.MtlArray(Base.rand(eltype(T), blocklen, prod(size(T)), ceil(Int,prod(dims)/(blocklen))), dims))) +ParallelStencil.ParallelKernel.rand_metal(::Type{T}, blocklength, dims...) where {T<:Union{SArray,FieldArray}} = rand_metal(T, blocklength, dims) +ParallelStencil.ParallelKernel.falses_metal(::Type{T}, blocklength, args...) where {T<:Union{SArray,FieldArray}} = fill_metal(T, blocklength, false, args...) +ParallelStencil.ParallelKernel.trues_metal(::Type{T}, blocklength, args...) where {T<:Union{SArray,FieldArray}} = fill_metal(T, blocklength, true, args...) + +# function ParallelStencil.ParallelKernel.fill_metal(::Type{T}, ::Val{B}, x, args...) 
where {T <: Union{SArray,FieldArray}, B} +# if (!(eltype(x) <: Number) || (eltype(x) == Bool)) && (eltype(x) != eltype(T)) @ArgumentError("fill: the (element) type of argument 'x' is not a normal number type ($(eltype(x))), but does not match the obtained (default) 'eltype' ($(eltype(T))); automatic conversion to $(eltype(T)) is therefore not attempted. Set the keyword argument 'eltype' accordingly to the element type of 'x' or pass an 'x' of a different (element) type.") end +# check_datatype_metal(T, Bool, Enum) +# if (length(x) == 1) cell = convert(T, fill(convert(eltype(T), x), size(T))) +# elseif (length(x) == length(T)) cell = convert(T, x) +# else @ArgumentError("fill: argument 'x' contains the wrong number of elements ($(length(x))). It must be a scalar or contain the number of elements defined by 'celldims'.") +# end +# return CellArrays.fill!(MtlCellArray{T,B}(undef, args...), cell) +# end + +# ParallelStencil.ParallelKernel.fill_metal!(A, x) = Metal.fill!(A, construct_cell(A, x)) + +check_datatype_metal(args...) = check_datatype(args..., INT_METAL) \ No newline at end of file From 7490133b905dec2af04fb1667fc6d0a8488ed8d6 Mon Sep 17 00:00:00 2001 From: GiackAloZ Date: Tue, 8 Oct 2024 14:45:47 +0200 Subject: [PATCH 09/46] Add more Metal allocators --- src/ParallelKernel/allocators.jl | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/ParallelKernel/allocators.jl b/src/ParallelKernel/allocators.jl index 90b8e240..0877126e 100644 --- a/src/ParallelKernel/allocators.jl +++ b/src/ParallelKernel/allocators.jl @@ -240,6 +240,13 @@ macro falses_amdgpu(args...) check_initialized(__module__); esc(_falses(__mod macro trues_amdgpu(args...) check_initialized(__module__); esc(_trues(__module__, args...; package=PKG_AMDGPU)); end macro fill_amdgpu(args...) check_initialized(__module__); esc(_fill(__module__, args...; package=PKG_AMDGPU)); end macro fill!_amdgpu(args...) check_initialized(__module__); esc(_fill!(__module__, args...; package=PKG_AMDGPU)); end +macro zeros_metal(args...) check_initialized(__module__); esc(_zeros(__module__, args...; package=PKG_METAL)); end +macro ones_metal(args...) check_initialized(__module__); esc(_ones(__module__, args...; package=PKG_METAL)); end +macro rand_metal(args...) check_initialized(__module__); esc(_rand(__module__, args...; package=PKG_METAL)); end +macro falses_metal(args...) check_initialized(__module__); esc(_falses(__module__, args...; package=PKG_METAL)); end +macro trues_metal(args...) check_initialized(__module__); esc(_trues(__module__, args...; package=PKG_METAL)); end +macro fill_metal(args...) check_initialized(__module__); esc(_fill(__module__, args...; package=PKG_METAL)); end +macro fill!_metal(args...) check_initialized(__module__); esc(_fill!(__module__, args...; package=PKG_METAL)); end macro zeros_threads(args...) check_initialized(__module__); esc(_zeros(__module__, args...; package=PKG_THREADS)); end macro ones_threads(args...) check_initialized(__module__); esc(_ones(__module__, args...; package=PKG_THREADS)); end macro rand_threads(args...) 
check_initialized(__module__); esc(_rand(__module__, args...; package=PKG_THREADS)); end @@ -274,6 +281,7 @@ function _zeros(caller::Module, args...; eltype=nothing, celldims=nothing, cellt blocklength = determine_blocklength(blocklength, package) if (package == PKG_CUDA) return :(ParallelStencil.ParallelKernel.zeros_cuda($celltype, $blocklength, $(args...))) elseif (package == PKG_AMDGPU) return :(ParallelStencil.ParallelKernel.zeros_amdgpu($celltype, $blocklength, $(args...))) + elseif (package == PKG_METAL) return :(ParallelStencil.ParallelKernel.zeros_metal($celltype, $blocklength, $(args...))) elseif iscpu(package) return :(ParallelStencil.ParallelKernel.zeros_cpu($celltype, $blocklength, $(args...))) else @KeywordArgumentError("$ERRMSG_UNSUPPORTED_PACKAGE (obtained: $package).") end @@ -284,6 +292,7 @@ function _ones(caller::Module, args...; eltype=nothing, celldims=nothing, cellty blocklength = determine_blocklength(blocklength, package) if (package == PKG_CUDA) return :(ParallelStencil.ParallelKernel.ones_cuda($celltype, $blocklength, $(args...))) elseif (package == PKG_AMDGPU) return :(ParallelStencil.ParallelKernel.ones_amdgpu($celltype, $blocklength, $(args...))) + elseif (package == PKG_METAL) return :(ParallelStencil.ParallelKernel.ones_metal($celltype, $blocklength, $(args...))) elseif iscpu(package) return :(ParallelStencil.ParallelKernel.ones_cpu($celltype, $blocklength, $(args...))) else @KeywordArgumentError("$ERRMSG_UNSUPPORTED_PACKAGE (obtained: $package).") end @@ -294,6 +303,7 @@ function _rand(caller::Module, args...; eltype=nothing, celldims=nothing, cellty blocklength = determine_blocklength(blocklength, package) if (package == PKG_CUDA) return :(ParallelStencil.ParallelKernel.rand_cuda($celltype, $blocklength, $(args...))) elseif (package == PKG_AMDGPU) return :(ParallelStencil.ParallelKernel.rand_amdgpu($celltype, $blocklength, $(args...))) + elseif (package == PKG_METAL) return :(ParallelStencil.ParallelKernel.rand_metal($celltype, $blocklength, $(args...))) elseif iscpu(package) return :(ParallelStencil.ParallelKernel.rand_cpu($celltype, $blocklength, $(args...))) else @KeywordArgumentError("$ERRMSG_UNSUPPORTED_PACKAGE (obtained: $package).") end @@ -304,6 +314,7 @@ function _falses(caller::Module, args...; celldims=nothing, blocklength=nothing, blocklength = determine_blocklength(blocklength, package) if (package == PKG_CUDA) return :(ParallelStencil.ParallelKernel.falses_cuda($celltype, $blocklength, $(args...))) elseif (package == PKG_AMDGPU) return :(ParallelStencil.ParallelKernel.falses_amdgpu($celltype, $blocklength, $(args...))) + elseif (package == PKG_METAL) return :(ParallelStencil.ParallelKernel.falses_metal($celltype, $blocklength, $(args...))) elseif iscpu(package) return :(ParallelStencil.ParallelKernel.falses_cpu($celltype, $blocklength, $(args...))) else @KeywordArgumentError("$ERRMSG_UNSUPPORTED_PACKAGE (obtained: $package).") end @@ -314,6 +325,7 @@ function _trues(caller::Module, args...; celldims=nothing, blocklength=nothing, blocklength = determine_blocklength(blocklength, package) if (package == PKG_CUDA) return :(ParallelStencil.ParallelKernel.trues_cuda($celltype, $blocklength, $(args...))) elseif (package == PKG_AMDGPU) return :(ParallelStencil.ParallelKernel.trues_amdgpu($celltype, $blocklength, $(args...))) + elseif (package == PKG_METAL) return :(ParallelStencil.ParallelKernel.trues_metal($celltype, $blocklength, $(args...))) elseif iscpu(package) return :(ParallelStencil.ParallelKernel.trues_cpu($celltype, $blocklength, $(args...))) 
else @KeywordArgumentError("$ERRMSG_UNSUPPORTED_PACKAGE (obtained: $package).") end @@ -324,6 +336,7 @@ function _fill(caller::Module, args...; eltype=nothing, celldims=nothing, cellty blocklength = determine_blocklength(blocklength, package) if (package == PKG_CUDA) return :(ParallelStencil.ParallelKernel.fill_cuda($celltype, $blocklength, $(args...))) elseif (package == PKG_AMDGPU) return :(ParallelStencil.ParallelKernel.fill_amdgpu($celltype, $blocklength, $(args...))) + elseif (package == PKG_METAL) return :(ParallelStencil.ParallelKernel.fill_metal($celltype, $blocklength, $(args...))) elseif iscpu(package) return :(ParallelStencil.ParallelKernel.fill_cpu($celltype, $blocklength, $(args...))) else @KeywordArgumentError("$ERRMSG_UNSUPPORTED_PACKAGE (obtained: $package).") end @@ -332,6 +345,7 @@ end function _fill!(caller::Module, args...; package::Symbol=get_package(caller)) if (package == PKG_CUDA) return :(ParallelStencil.ParallelKernel.fill_cuda!($(args...))) elseif (package == PKG_AMDGPU) return :(ParallelStencil.ParallelKernel.fill_amdgpu!($(args...))) + elseif (package == PKG_METAL) return :(ParallelStencil.ParallelKernel.fill_metal!($(args...))) elseif iscpu(package) return :(ParallelStencil.ParallelKernel.fill_cpu!($(args...))) else @KeywordArgumentError("$ERRMSG_UNSUPPORTED_PACKAGE (obtained: $package).") end From 8917bd869aa694f6ce759589a644b2493e05f20d Mon Sep 17 00:00:00 2001 From: GiackAloZ Date: Tue, 8 Oct 2024 14:49:35 +0200 Subject: [PATCH 10/46] Add Metal data module function --- src/ParallelKernel/Data.jl | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/src/ParallelKernel/Data.jl b/src/ParallelKernel/Data.jl index c1e1833d..2e6b47fe 100644 --- a/src/ParallelKernel/Data.jl +++ b/src/ParallelKernel/Data.jl @@ -227,6 +227,42 @@ function Data_amdgpu(modulename::Symbol, numbertype::DataType, indextype::DataTy return prewalk(rmlines, flatten(Data_module)) end +function Data_metal(modulename::Symbol, numbertype::DataType, indextype::DataType) + Data_module = if (numbertype == NUMBERTYPE_NONE) + :(baremodule $modulename # NOTE: there cannot be any newline before 'module Data' or it will create a begin end block and the module creation will fail. + import Base, Metal, ParallelStencil.ParallelKernel.CellArrays, ParallelStencil.ParallelKernel.StaticArrays + const Index = $indextype + const Array{T, N} = Metal.MtlArray{T, N} + const DeviceArray{T, N} = Metal.MtlDeviceArray{T, N} + const Cell{T, S} = Union{StaticArrays.SArray{S, T}, StaticArrays.FieldArray{S, T}} + const DeviceCell{T, S} = Union{StaticArrays.SArray{S, T}, StaticArrays.FieldArray{S, T}} + # const CellArray{T_elem, N, B} = CellArrays.MTLCellArray{<:Cell{T_elem},N,B,T_elem} + # const DeviceCellArray{T_elem, N, B} = CellArrays.MTLCellArray{<:DeviceCell{T_elem},N,B,<:Metal.MtlDeviceArray{T_elem,CellArrays._N}} + $(create_shared_exprs(numbertype, indextype)) + end) + else + :(baremodule $modulename # NOTE: there cannot be any newline before 'module Data' or it will create a begin end block and the module creation will fail. 
+ import Base, Metal, ParallelStencil.ParallelKernel.CellArrays, ParallelStencil.ParallelKernel.StaticArrays + const Index = $indextype + const Number = $numbertype + const Array{N} = Metal.MtlArray{$numbertype, N} + const DeviceArray{N} = Metal.MtlDeviceArray{$numbertype, N} + const Cell{S} = Union{StaticArrays.SArray{S, $numbertype}, StaticArrays.FieldArray{S, $numbertype}} + const DeviceCell{S} = Union{StaticArrays.SArray{S, $numbertype}, StaticArrays.FieldArray{S, $numbertype}} + # const CellArray{N, B} = CellArrays.MTLCellArray{<:Cell,N,B,$numbertype} + # const DeviceCellArray{N, B} = CellArrays.MTLCellArray{<:DeviceCell,N,B,<:Metal.MtlDeviceArray{$numbertype,CellArrays._N}} + const TArray{T, N} = Metal.MtlArray{T, N} + const DeviceTArray{T, N} = Metal.MtlDeviceArray{T, N} + const TCell{T, S} = Union{StaticArrays.SArray{S, T}, StaticArrays.FieldArray{S, T}} + const DeviceTCell{T, S} = Union{StaticArrays.SArray{S, T}, StaticArrays.FieldArray{S, T}} + # const TCellArray{T_elem, N, B} = CellArrays.MTLCellArray{<:TCell{T_elem},N,B,T_elem} + # const DeviceTCellArray{T_elem, N, B} = CellArrays.MTLCellArray{<:DeviceTCell{T_elem},N,B,<:Metal.MtlDeviceArray{T_elem,CellArrays._N}} + $(create_shared_exprs(numbertype, indextype)) + end) + end + return prewalk(rmlines, flatten(Data_module)) +end + function Data_cpu(modulename::Symbol, numbertype::DataType, indextype::DataType) Data_module = if (numbertype == NUMBERTYPE_NONE) :(baremodule $modulename # NOTE: there cannot be any newline before 'module Data' or it will create a begin end block and the module creation will fail. From 0b716e2c5e8e865b67dd7da9df8b70ded6925594 Mon Sep 17 00:00:00 2001 From: GiackAloZ Date: Tue, 8 Oct 2024 14:54:20 +0200 Subject: [PATCH 11/46] Add function to get Metal streams (queues) in hide comm --- src/ParallelKernel/hide_communication.jl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/ParallelKernel/hide_communication.jl b/src/ParallelKernel/hide_communication.jl index 2360cc27..25bc8fbe 100644 --- a/src/ParallelKernel/hide_communication.jl +++ b/src/ParallelKernel/hide_communication.jl @@ -121,6 +121,7 @@ end function get_priority_stream(caller::Module, args::Union{Integer,Symbol,Expr}...; package::Symbol=get_package(caller)) if (package == PKG_CUDA) get_priority_stream_cuda(args...) elseif (package == PKG_AMDGPU) get_priority_stream_amdgpu(args...) + elseif (package == PKG_METAL) get_priority_stream_metal(args...) else @ArgumentError("unsupported GPU package (obtained: $package).") end end @@ -128,6 +129,7 @@ end function get_stream(caller::Module, args::Union{Integer,Symbol,Expr}...; package::Symbol=get_package(caller)) if (package == PKG_CUDA) get_stream_cuda(args...) elseif (package == PKG_AMDGPU) get_stream_amdgpu(args...) + elseif (package == PKG_METAL) get_stream_metal(args...) 
else @ArgumentError("unsupported GPU package (obtained: $package).") end end @@ -222,8 +224,10 @@ end get_priority_stream_cuda(id::Union{Integer,Symbol,Expr}) = return :(ParallelStencil.ParallelKernel.get_priority_custream($id)) get_priority_stream_amdgpu(id::Union{Integer,Symbol,Expr}) = return :(ParallelStencil.ParallelKernel.get_priority_rocstream($id)) +get_priority_stream_metal(id::Union{Integer,Symbol,Expr}) = return :(ParallelStencil.ParallelKernel.get_priority_metalqueue($id)) get_stream_cuda(id::Union{Integer,Symbol,Expr}) = return :(ParallelStencil.ParallelKernel.get_custream($id)) get_stream_amdgpu(id::Union{Integer,Symbol,Expr}) = return :(ParallelStencil.ParallelKernel.get_rocstream($id)) +get_stream_metal(id::Union{Integer,Symbol,Expr}) = return :(ParallelStencil.ParallelKernel.get_metalqueue($id)) ## FUNCTIONS TO EXTRACT AND PROCESS COMPUTATION AND BOUNDARY CONDITIONS CALLS / COMMUNICATION CALLS From b69ad03002e346c0f5b051016f076ba37f0a30cb Mon Sep 17 00:00:00 2001 From: GiackAloZ Date: Tue, 8 Oct 2024 14:56:34 +0200 Subject: [PATCH 12/46] Add Metal to init parallel kernel --- src/ParallelKernel/init_parallel_kernel.jl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/ParallelKernel/init_parallel_kernel.jl b/src/ParallelKernel/init_parallel_kernel.jl index d70a1b26..a13825ac 100644 --- a/src/ParallelKernel/init_parallel_kernel.jl +++ b/src/ParallelKernel/init_parallel_kernel.jl @@ -4,7 +4,7 @@ Initialize the package ParallelKernel, giving access to its main functionality. Creates a module `Data` in the module where `@init_parallel_kernel` is called from. The module `Data` contains the types as `Data.Number`, `Data.Array` and `Data.CellArray` (type `?Data` *after* calling `@init_parallel_kernel` to see the full description of the module). # Arguments -- `package::Module`: the package used for parallelization (CUDA or AMDGPU for GPU, or Threads or Polyester for CPU). +- `package::Module`: the package used for parallelization (CUDA or AMDGPU or Metal for GPU, or Threads or Polyester for CPU). - `numbertype::DataType`: the type of numbers used by @zeros, @ones, @rand and @fill and in all array types of module `Data` (e.g. Float32 or Float64). It is contained in `Data.Number` after @init_parallel_kernel. - `inbounds::Bool=false`: whether to apply `@inbounds` to the kernels by default (overwritable in each kernel definition). @@ -35,6 +35,10 @@ function init_parallel_kernel(caller::Module, package::Symbol, numbertype::DataT if (isinteractive() && !is_installed("AMDGPU")) @NotInstalledError("AMDGPU was selected as package for parallelization, but AMDGPU.jl is not installed. AMDGPU functionality is provided as an extension of $parent_module and AMDGPU.jl needs therefore to be installed independently (type `add AMDGPU` in the julia package manager).") end indextype = INT_AMDGPU data_module = Data_amdgpu(modulename, numbertype, indextype) + elseif package == PKG_METAL + if (isinteractive() && !is_installed("Metal")) @NotInstalledError("Metal was selected as package for parallelization, but Metal.jl is not installed. 
Metal functionality is provided as an extension of $parent_module and Metal.jl needs therefore to be installed independently (type `add Metal` in the julia package manager).") end + indextype = INT_METAL + data_module = Data_metal(modulename, numbertype, indextype) elseif package == PKG_POLYESTER if (isinteractive() && !is_installed("Polyester")) @NotInstalledError("Polyester was selected as package for parallelization, but Polyester.jl is not installed. Multi-threading using Polyester is provided as an extension of $parent_module and Polyester.jl needs therefore to be installed independently (type `add Polyester` in the julia package manager).") end indextype = INT_POLYESTER From 0d3fdc92bb43be03615da1101c7c1932bac1eb0f Mon Sep 17 00:00:00 2001 From: GiackAloZ Date: Tue, 8 Oct 2024 17:25:07 +0200 Subject: [PATCH 13/46] Implement Metal specific kernel language functions --- src/ParallelKernel/kernel_language.jl | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/ParallelKernel/kernel_language.jl b/src/ParallelKernel/kernel_language.jl index a714a95a..ca3b977f 100644 --- a/src/ParallelKernel/kernel_language.jl +++ b/src/ParallelKernel/kernel_language.jl @@ -172,6 +172,7 @@ end function gridDim(caller::Module, args...; package::Symbol=get_package(caller)) if (package == PKG_CUDA) return :(CUDA.gridDim($(args...))) elseif (package == PKG_AMDGPU) return :(AMDGPU.gridGroupDim($(args...))) + elseif (package == PKG_METAL) return :(Metal.threadgroups_per_grid_3d($(args...))) elseif iscpu(package) return :(ParallelStencil.ParallelKernel.@gridDim_cpu($(args...))) else @KeywordArgumentError("$ERRMSG_UNSUPPORTED_PACKAGE (obtained: $package).") end @@ -180,6 +181,7 @@ end function blockIdx(caller::Module, args...; package::Symbol=get_package(caller)) #NOTE: the CPU implementation relies on the fact that ranges are always of type UnitRange. If this changes, then this function needs to be adapted. if (package == PKG_CUDA) return :(CUDA.blockIdx($(args...))) elseif (package == PKG_AMDGPU) return :(AMDGPU.workgroupIdx($(args...))) + elseif (package == PKG_METAL) return :(Metal.threadgroup_position_in_grid_3d($(args...))) elseif iscpu(package) return :(ParallelStencil.ParallelKernel.@blockIdx_cpu($(args...))) else @KeywordArgumentError("$ERRMSG_UNSUPPORTED_PACKAGE (obtained: $package).") end @@ -188,6 +190,7 @@ end function blockDim(caller::Module, args...; package::Symbol=get_package(caller)) #NOTE: the CPU implementation follows the model that no threads are grouped into blocks, i.e. that each block contains only 1 thread (with thread ID 1). The parallelization happens only over the blocks. if (package == PKG_CUDA) return :(CUDA.blockDim($(args...))) elseif (package == PKG_AMDGPU) return :(AMDGPU.workgroupDim($(args...))) + elseif (package == PKG_METAL) return :(Metal.threads_per_threadgroup_3d($(args...))) elseif iscpu(package) return :(ParallelStencil.ParallelKernel.@blockDim_cpu($(args...))) else @KeywordArgumentError("$ERRMSG_UNSUPPORTED_PACKAGE (obtained: $package).") end @@ -196,6 +199,7 @@ end function threadIdx(caller::Module, args...; package::Symbol=get_package(caller)) #NOTE: the CPU implementation follows the model that no threads are grouped into blocks, i.e. that each block contains only 1 thread (with thread ID 1). The parallelization happens only over the blocks. 
if (package == PKG_CUDA) return :(CUDA.threadIdx($(args...))) elseif (package == PKG_AMDGPU) return :(AMDGPU.workitemIdx($(args...))) + elseif (package == PKG_METAL) return :(Metal.thread_position_in_threadgroup_3d($(args...))) elseif iscpu(package) return :(ParallelStencil.ParallelKernel.@threadIdx_cpu($(args...))) else @KeywordArgumentError("$ERRMSG_UNSUPPORTED_PACKAGE (obtained: $package).") end @@ -207,6 +211,7 @@ end function sync_threads(caller::Module, args...; package::Symbol=get_package(caller)) #NOTE: the CPU implementation follows the model that no threads are grouped into blocks, i.e. that each block contains only 1 thread (with thread ID 1). The parallelization happens only over the blocks. Synchronization within a block is therefore not needed (as it contains only one thread). if (package == PKG_CUDA) return :(CUDA.sync_threads($(args...))) elseif (package == PKG_AMDGPU) return :(AMDGPU.sync_workgroup($(args...))) + elseif (package == PKG_METAL) return :(Metal.threadgroup_barrier($(args...); flag=Metal.MemoryFlagThreadGroup)) elseif iscpu(package) return :(ParallelStencil.ParallelKernel.@sync_threads_cpu($(args...))) else @KeywordArgumentError("$ERRMSG_UNSUPPORTED_PACKAGE (obtained: $package).") end @@ -218,6 +223,7 @@ end function sharedMem(caller::Module, args...; package::Symbol=get_package(caller)) if (package == PKG_CUDA) return :(CUDA.@cuDynamicSharedMem($(args...))) elseif (package == PKG_AMDGPU) return :(ParallelStencil.ParallelKernel.@sharedMem_amdgpu($(args...))) + elseif (package == PKG_METAL) return :(ParallelStencil.ParallelKernel.@sharedMem_metal($(args...))) elseif iscpu(package) return :(ParallelStencil.ParallelKernel.@sharedMem_cpu($(args...))) else @KeywordArgumentError("$ERRMSG_UNSUPPORTED_PACKAGE (obtained: $package).") end @@ -227,12 +233,16 @@ macro sharedMem_amdgpu(T, dims) esc(:(AMDGPU.@ROCDynamicLocalArray($T, $dims, fa macro sharedMem_amdgpu(T, dims, offset) esc(:(ParallelStencil.ParallelKernel.@sharedMem_amdgpu($T, $dims))) end +macro sharedMem_metal(T, dims) :(Metal.MtlThreadGroupArray($T, $dims)); end + +macro sharedMem_metal(T, dims, offset) esc(:(ParallelStencil.ParallelKernel.@sharedMem_metal($T, $dims))) end ## FUNCTIONS FOR PRINTING function pk_show(caller::Module, args...; package::Symbol=get_package(caller)) if (package == PKG_CUDA) return :(CUDA.@cushow($(args...))) elseif (package == PKG_AMDGPU) @KeywordArgumentError("this functionality is not yet supported in AMDGPU.jl.") + elseif (package == PKG_METAL) @KeywordArgumentError("this functionality is not yet supported in Metal.jl.") elseif iscpu(package) return :(Base.@show($(args...))) else @KeywordArgumentError("$ERRMSG_UNSUPPORTED_PACKAGE (obtained: $package).") end @@ -241,6 +251,7 @@ end function pk_println(caller::Module, args...; package::Symbol=get_package(caller)) if (package == PKG_CUDA) return :(CUDA.@cuprintln($(args...))) elseif (package == PKG_AMDGPU) return :(AMDGPU.@rocprintln($(args...))) + elseif (package == PKG_METAL) @KeywordArgumentError("this functionality is not yet supported in Metal.jl.") elseif iscpu(package) return :(Base.println($(args...))) else @KeywordArgumentError("$ERRMSG_UNSUPPORTED_PACKAGE (obtained: $package).") end From c6c07d0d448c8649e466ca47cab97038dfee8089 Mon Sep 17 00:00:00 2001 From: GiackAloZ Date: Tue, 8 Oct 2024 18:03:34 +0200 Subject: [PATCH 14/46] Add parallel kernel calls --- src/ParallelKernel/MetalExt/defaults.jl | 4 ++-- src/ParallelKernel/MetalExt/shared.jl | 15 ++++++++------- src/ParallelKernel/ParallelKernel.jl | 1 + 
src/ParallelKernel/hide_communication.jl | 4 ++-- src/ParallelKernel/parallel.jl | 16 +++++++++++++++- 5 files changed, 28 insertions(+), 12 deletions(-) diff --git a/src/ParallelKernel/MetalExt/defaults.jl b/src/ParallelKernel/MetalExt/defaults.jl index 16750d52..abc3e224 100644 --- a/src/ParallelKernel/MetalExt/defaults.jl +++ b/src/ParallelKernel/MetalExt/defaults.jl @@ -2,8 +2,8 @@ const ERRMSG_METALEXT_NOT_LOADED = "the Metal extension was not loaded. Make sur # shared.jl -function get_priority_mtlstream end -function get_mtlstream end +function get_priority_metalstream end +function get_metalstream end # allocators diff --git a/src/ParallelKernel/MetalExt/shared.jl b/src/ParallelKernel/MetalExt/shared.jl index 8f9587bb..686a51aa 100644 --- a/src/ParallelKernel/MetalExt/shared.jl +++ b/src/ParallelKernel/MetalExt/shared.jl @@ -11,19 +11,20 @@ import Metal.MTL ParallelStencil.ParallelKernel.is_loaded(::Val{:ParallelStencil_MetalExt}) = true ## FUNCTIONS TO GET CREATE AND MANAGE METAL QUEUES -ParallelStencil.ParallelKernel.get_priority_metalqueue(arg...) = get_priority_metalqueue(arg...) -ParallelStencil.ParallelKernel.get_metalqueue(arg...) = get_metalqueue(arg...) +ParallelStencil.ParallelKernel.get_priority_stream(arg...) = get_priority_metalstream(arg...) +ParallelStencil.ParallelKernel.get_metalstream(arg...) = get_metalstream(arg...) + let - global get_priority_metalqueue, get_metalqueue - priority_metalqueues = Array{MTLCommandQueue}(undef, 0) - metalqueues = Array{MTLCommandQueue}(undef, 0) + global get_priority_metalstream, get_metalstream + priority_metalqueues = Array{MTL.MTLCommandQueue}(undef, 0) + metalqueues = Array{MTL.MTLCommandQueue}(undef, 0) - function get_priority_metalqueue(id::Integer) + function get_priority_metalstream(id::Integer) while (id > length(priority_metalqueues)) push!(priority_metalqueues, MTL.MTLCommandQueue(device())) end # No priority setting available in Metal queues. 
return priority_metalqueues[id] end - function get_metalqueue(id::Integer) + function get_metalstream(id::Integer) while (id > length(metalqueues)) push!(metalqueues, MTL.MTLCommandQueue(device())) end return metalqueues[id] end diff --git a/src/ParallelKernel/ParallelKernel.jl b/src/ParallelKernel/ParallelKernel.jl index 51567db4..345aacd9 100644 --- a/src/ParallelKernel/ParallelKernel.jl +++ b/src/ParallelKernel/ParallelKernel.jl @@ -51,6 +51,7 @@ include("Data.jl"); ## Alphabetical include of defaults for extensions include(joinpath("AMDGPUExt", "defaults.jl")) include(joinpath("CUDAExt", "defaults.jl")) +include(joinpath("MetalExt", "defaults.jl")) ## Include of constant parameters, types and syntax sugar shared in ParallelKernel module only include("shared.jl") diff --git a/src/ParallelKernel/hide_communication.jl b/src/ParallelKernel/hide_communication.jl index 25bc8fbe..0eb58fc6 100644 --- a/src/ParallelKernel/hide_communication.jl +++ b/src/ParallelKernel/hide_communication.jl @@ -224,10 +224,10 @@ end get_priority_stream_cuda(id::Union{Integer,Symbol,Expr}) = return :(ParallelStencil.ParallelKernel.get_priority_custream($id)) get_priority_stream_amdgpu(id::Union{Integer,Symbol,Expr}) = return :(ParallelStencil.ParallelKernel.get_priority_rocstream($id)) -get_priority_stream_metal(id::Union{Integer,Symbol,Expr}) = return :(ParallelStencil.ParallelKernel.get_priority_metalqueue($id)) +get_priority_stream_metal(id::Union{Integer,Symbol,Expr}) = return :(ParallelStencil.ParallelKernel.get_priority_metalstream($id)) get_stream_cuda(id::Union{Integer,Symbol,Expr}) = return :(ParallelStencil.ParallelKernel.get_custream($id)) get_stream_amdgpu(id::Union{Integer,Symbol,Expr}) = return :(ParallelStencil.ParallelKernel.get_rocstream($id)) -get_stream_metal(id::Union{Integer,Symbol,Expr}) = return :(ParallelStencil.ParallelKernel.get_metalqueue($id)) +get_stream_metal(id::Union{Integer,Symbol,Expr}) = return :(ParallelStencil.ParallelKernel.get_metalstream($id)) ## FUNCTIONS TO EXTRACT AND PROCESS COMPUTATION AND BOUNDARY CONDITIONS CALLS / COMMUNICATION CALLS diff --git a/src/ParallelKernel/parallel.jl b/src/ParallelKernel/parallel.jl index 46c991b3..0e24b96b 100644 --- a/src/ParallelKernel/parallel.jl +++ b/src/ParallelKernel/parallel.jl @@ -90,18 +90,22 @@ macro synchronize(args...) check_initialized(__module__); esc(synchronize(__modu macro parallel_cuda(args...) check_initialized(__module__); checkargs_parallel(args...); esc(parallel(__module__, args...; package=PKG_CUDA)); end macro parallel_amdgpu(args...) check_initialized(__module__); checkargs_parallel(args...); esc(parallel(__module__, args...; package=PKG_AMDGPU)); end +macro parallel_metal(args...) check_initialized(__module__); checkargs_parallel(args...); esc(parallel(__module__, args...; package=PKG_METAL)); end macro parallel_threads(args...) check_initialized(__module__); checkargs_parallel(args...); esc(parallel(__module__, args...; package=PKG_THREADS)); end macro parallel_polyester(args...) check_initialized(__module__); checkargs_parallel(args...); esc(parallel(__module__, args...; package=PKG_POLYESTER)); end macro parallel_indices_cuda(args...) check_initialized(__module__); checkargs_parallel_indices(args...); esc(parallel_indices(__module__, args...; package=PKG_CUDA)); end macro parallel_indices_amdgpu(args...) check_initialized(__module__); checkargs_parallel_indices(args...); esc(parallel_indices(__module__, args...; package=PKG_AMDGPU)); end +macro parallel_indices_metal(args...) 
check_initialized(__module__); checkargs_parallel_indices(args...); esc(parallel_indices(__module__, args...; package=PKG_METAL)); end macro parallel_indices_threads(args...) check_initialized(__module__); checkargs_parallel_indices(args...); esc(parallel_indices(__module__, args...; package=PKG_THREADS)); end macro parallel_indices_polyester(args...) check_initialized(__module__); checkargs_parallel_indices(args...); esc(parallel_indices(__module__, args...; package=PKG_POLYESTER)); end macro parallel_async_cuda(args...) check_initialized(__module__); checkargs_parallel(args...); esc(parallel_async(__module__, args...; package=PKG_CUDA)); end macro parallel_async_amdgpu(args...) check_initialized(__module__); checkargs_parallel(args...); esc(parallel_async(__module__, args...; package=PKG_AMDGPU)); end +macro parallel_async_metal(args...) check_initialized(__module__); checkargs_parallel(args...); esc(parallel_async(__module__, args...; package=PKG_METAL)); end macro parallel_async_threads(args...) check_initialized(__module__); checkargs_parallel(args...); esc(parallel_async(__module__, args...; package=PKG_THREADS)); end macro parallel_async_polyester(args...) check_initialized(__module__); checkargs_parallel(args...); esc(parallel_async(__module__, args...; package=PKG_POLYESTER)); end macro synchronize_cuda(args...) check_initialized(__module__); esc(synchronize(__module__, args...; package=PKG_CUDA)); end macro synchronize_amdgpu(args...) check_initialized(__module__); esc(synchronize(__module__, args...; package=PKG_AMDGPU)); end +macro synchronize_metal(args...) check_initialized(__module__); esc(synchronize(__module__, args...; package=PKG_METAL)); end macro synchronize_threads(args...) check_initialized(__module__); esc(synchronize(__module__, args...; package=PKG_THREADS)); end macro synchronize_polyester(args...) check_initialized(__module__); esc(synchronize(__module__, args...; package=PKG_POLYESTER)); end @@ -158,6 +162,7 @@ end function synchronize(caller::Module, args::Union{Symbol,Expr}...; package::Symbol=get_package(caller)) if (package == PKG_CUDA) synchronize_cuda(args...) elseif (package == PKG_AMDGPU) synchronize_amdgpu(args...) + elseif (package == PKG_METAL) synchronize_metal(args...) elseif (package == PKG_THREADS) synchronize_threads(args...) elseif (package == PKG_POLYESTER) synchronize_polyester(args...) else @KeywordArgumentError("$ERRMSG_UNSUPPORTED_PACKAGE (obtained: $package).") @@ -236,6 +241,7 @@ function parallel_call_gpu(ranges::Union{Symbol,Expr}, nblocks::Union{Symbol,Exp ranges = :(ParallelStencil.ParallelKernel.promote_ranges($ranges)) if (package == PKG_CUDA) int_type = INT_CUDA elseif (package == PKG_AMDGPU) int_type = INT_AMDGPU + elseif (package == PKG_METAL) int_type = INT_METAL end push!(kernelcall.args, ranges) #TODO: to enable indexing with other then Int64 something like the following but probably better in a function will also be necessary: push!(kernelcall.args, :(convert(Tuple{UnitRange{$int_type},UnitRange{$int_type},UnitRange{$int_type}}, $ranges))) push!(kernelcall.args, :($int_type(length($ranges[1])))) @@ -304,6 +310,7 @@ end synchronize_cuda(args::Union{Symbol,Expr}...) = :(CUDA.synchronize($(args...); blocking=true)) synchronize_amdgpu(args::Union{Symbol,Expr}...) = :(AMDGPU.synchronize($(args...); blocking=true)) +synchronize_metal(args::Union{Symbol,Expr}...) = :(Metal.synchronize($(args...))) synchronize_threads(args::Union{Symbol,Expr}...) = :(begin end) synchronize_polyester(args::Union{Symbol,Expr}...) 
= :(begin end) @@ -559,17 +566,22 @@ function create_gpu_call(package::Symbol, nblocks::Union{Symbol,Expr}, nthreads: if !isnothing(shmem) if (package == PKG_CUDA) shmem_expr = :(shmem = $shmem) elseif (package == PKG_AMDGPU) shmem_expr = :(shmem = $shmem) + elseif (package == PKG_METAL) shmem_expr = nothing # No need to pass shared memory to Metal kernels. else @ModuleInternalError("unsupported GPU package (obtained: $package).") end - backend_kwargs_expr = (backend_kwargs_expr..., shmem_expr) + if package != PKG_METAL + backend_kwargs_expr = (backend_kwargs_expr..., shmem_expr) + end end if (package == PKG_CUDA) return :( CUDA.@cuda blocks=$nblocks threads=$nthreads stream=$stream $(backend_kwargs_expr...) $kernelcall; $synccall ) elseif (package == PKG_AMDGPU) return :( AMDGPU.@roc gridsize=$nblocks groupsize=$nthreads stream=$stream $(backend_kwargs_expr...) $kernelcall; $synccall ) + elseif (package == PKG_METAL) return :( Metal.@metal groups=$nblocks threads=$nthreads queue=$stream $(backend_kwargs_expr...) $kernelcall; $synccall ) else @ModuleInternalError("unsupported GPU package (obtained: $package).") end else if (package == PKG_CUDA) return :( CUDA.@cuda launch=false $(backend_kwargs_expr...) $kernelcall) # NOTE: runtime arguments must be omitted when the kernel is not launched (backend_kwargs_expr must not contain any around time argument) elseif (package == PKG_AMDGPU) return :( AMDGPU.@roc launch=false $(backend_kwargs_expr...) $kernelcall) # NOTE: ... + elseif (package == PKG_METAL) return :( Metal.@metal launch=false $(backend_kwargs_expr...) $kernelcall) # NOTE: ... else @ModuleInternalError("unsupported GPU package (obtained: $package).") end end @@ -578,6 +590,7 @@ end function create_synccall(package::Symbol, stream::Union{Symbol,Expr}) if (package == PKG_CUDA) synchronize_cuda(stream) elseif (package == PKG_AMDGPU) synchronize_amdgpu(stream) + elseif (package == PKG_METAL) synchronize_metal(stream) else @ModuleInternalError("unsupported GPU package (obtained: $package).") end end @@ -585,6 +598,7 @@ end function default_stream(package) if (package == PKG_CUDA) return :(CUDA.stream()) # Use the default stream of the task. elseif (package == PKG_AMDGPU) return :(AMDGPU.stream()) # Use the default stream of the task. + elseif (package == PKG_METAL) return :(Metal.global_queue(device())) # Use the default queue of the task. 
else @ModuleInternalError("unsupported GPU package (obtained: $package).") end end \ No newline at end of file From f00838eec53e9a2e9e2734d6dc2bfb8420c465f3 Mon Sep 17 00:00:00 2001 From: GiackAloZ Date: Wed, 9 Oct 2024 09:13:21 +0200 Subject: [PATCH 15/46] Add Metal to shared --- src/ParallelKernel/shared.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ParallelKernel/shared.jl b/src/ParallelKernel/shared.jl index d6bf9efb..a2deb486 100644 --- a/src/ParallelKernel/shared.jl +++ b/src/ParallelKernel/shared.jl @@ -60,6 +60,7 @@ const ERRMSG_CHECK_LITERALTYPES = "the type given to 'literaltype' must be on const CELLARRAY_BLOCKLENGTH = Dict(PKG_NONE => 0, PKG_CUDA => 0, PKG_AMDGPU => 0, + PKG_METAL => 0, PKG_THREADS => 1, PKG_POLYESTER => 1) @@ -463,7 +464,7 @@ end ## FUNCTIONS/MACROS FOR DIVERSE SYNTAX SUGAR iscpu(package) = return (package in (PKG_THREADS, PKG_POLYESTER)) -isgpu(package) = return (package in (PKG_CUDA, PKG_AMDGPU)) +isgpu(package) = return (package in (PKG_CUDA, PKG_AMDGPU, PKG_METAL)) ## TEMPORARY FUNCTION DEFINITIONS TO BE MERGED IN MACROTOOLS (https://github.com/FluxML/MacroTools.jl/pull/173) From 740ced723f5897623dde2d0cbaf31ddb71c3c0ef Mon Sep 17 00:00:00 2001 From: GiackAloZ Date: Wed, 9 Oct 2024 09:39:20 +0200 Subject: [PATCH 16/46] Add Metal to PS and tests --- src/kernel_language.jl | 1 + src/parallel.jl | 5 ++++- src/shared.jl | 3 ++- test/runtests.jl | 7 ++++++- test/test_FiniteDifferences1D.jl | 4 ++++ test/test_FiniteDifferences2D.jl | 4 ++++ test/test_FiniteDifferences3D.jl | 4 ++++ test/test_extensions.jl | 6 +++++- test/test_incremental_compilation.jl | 4 ++++ test/test_init_parallel_stencil.jl | 4 ++++ test/test_parallel.jl | 4 ++++ test/test_reset_parallel_stencil.jl | 4 ++++ 12 files changed, 46 insertions(+), 4 deletions(-) diff --git a/src/kernel_language.jl b/src/kernel_language.jl index 92d59e7a..6c7e4dd2 100644 --- a/src/kernel_language.jl +++ b/src/kernel_language.jl @@ -71,6 +71,7 @@ function memopt(metadata_module::Module, is_parallel_kernel::Bool, caller::Modul if (package ∉ SUPPORTED_PACKAGES) @KeywordArgumentError("$ERRMSG_UNSUPPORTED_PACKAGE (obtained: $package).") end if (package == PKG_CUDA) int_type = INT_CUDA elseif (package == PKG_AMDGPU) int_type = INT_AMDGPU + elseif (package == PKG_METAL) int_type = INT_METAL elseif (package == PKG_THREADS) int_type = INT_THREADS end body = eval_offsets(caller, body, indices, int_type) diff --git a/src/parallel.jl b/src/parallel.jl index fd52b1cc..27a2a86b 100644 --- a/src/parallel.jl +++ b/src/parallel.jl @@ -86,14 +86,17 @@ macro parallel_async(args...) check_initialized(__module__); checkargs_parallel( macro parallel_cuda(args...) check_initialized(__module__); checkargs_parallel(args...); esc(parallel(__source__, __module__, args...; package=PKG_CUDA)); end macro parallel_amdgpu(args...) check_initialized(__module__); checkargs_parallel(args...); esc(parallel(__source__, __module__, args...; package=PKG_AMDGPU)); end +macro parallel_metal(args...) check_initialized(__module__); checkargs_parallel(args...); esc(parallel(__source__, __module__, args...; package=PKG_METAL)); end macro parallel_threads(args...) check_initialized(__module__); checkargs_parallel(args...); esc(parallel(__source__, __module__, args...; package=PKG_THREADS)); end macro parallel_polyester(args...) check_initialized(__module__); checkargs_parallel(args...); esc(parallel(__source__, __module__, args...; package=PKG_POLYESTER)); end macro parallel_indices_cuda(args...) 
check_initialized(__module__); checkargs_parallel_indices(args...); esc(parallel_indices(__source__, __module__, args...; package=PKG_CUDA)); end macro parallel_indices_amdgpu(args...) check_initialized(__module__); checkargs_parallel_indices(args...); esc(parallel_indices(__source__, __module__, args...; package=PKG_AMDGPU)); end +macro parallel_indices_metal(args...) check_initialized(__module__); checkargs_parallel_indices(args...); esc(parallel_indices(__source__, __module__, args...; package=PKG_METAL)); end macro parallel_indices_threads(args...) check_initialized(__module__); checkargs_parallel_indices(args...); esc(parallel_indices(__source__, __module__, args...; package=PKG_THREADS)); end macro parallel_indices_polyester(args...) check_initialized(__module__); checkargs_parallel_indices(args...); esc(parallel_indices(__source__, __module__, args...; package=PKG_POLYESTER)); end macro parallel_async_cuda(args...) check_initialized(__module__); checkargs_parallel(args...); esc(parallel_async(__source__, __module__, args...; package=PKG_CUDA)); end macro parallel_async_amdgpu(args...) check_initialized(__module__); checkargs_parallel(args...); esc(parallel_async(__source__, __module__, args...; package=PKG_AMDGPU)); end +macro parallel_async_metal(args...) check_initialized(__module__); checkargs_parallel(args...); esc(parallel_async(__source__, __module__, args...; package=PKG_METAL)); end macro parallel_async_threads(args...) check_initialized(__module__); checkargs_parallel(args...); esc(parallel_async(__source__, __module__, args...; package=PKG_THREADS)); end macro parallel_async_polyester(args...) check_initialized(__module__); checkargs_parallel(args...); esc(parallel_async(__source__, __module__, args...; package=PKG_POLYESTER)); end @@ -350,7 +353,7 @@ end ## FUNCTIONS TO DETERMINE OPTIMIZATION PARAMETERS -determine_nthreads_max_memopt(package::Symbol) = (package == PKG_AMDGPU) ? NTHREADS_MAX_MEMOPT_AMDGPU : NTHREADS_MAX_MEMOPT_CUDA +determine_nthreads_max_memopt(package::Symbol) = (package == PKG_AMDGPU) ? NTHREADS_MAX_MEMOPT_AMDGPU : ((package == PKG_CUDA) ? NTHREADS_MAX_MEMOPT_CUDA : NTHREADS_MAX_MEMOPT_METAL) determine_loopdim(indices::Union{Symbol,Expr}) = isa(indices,Expr) && (length(indices.args)==3) ? 3 : LOOPDIM_NONE # TODO: currently only loopdim=3 is supported. 
compute_loopsize() = LOOPSIZE diff --git a/src/shared.jl b/src/shared.jl index 9f47b7c0..5b647da1 100644 --- a/src/shared.jl +++ b/src/shared.jl @@ -1,6 +1,6 @@ import MacroTools: @capture, postwalk, splitdef, splitarg # NOTE: inexpr_walk used instead of MacroTools.inexpr import .ParallelKernel: eval_arg, split_args, split_kwargs, extract_posargs_init, extract_kernel_args, insert_device_types, is_kernel, is_call, gensym_world, isgpu, iscpu, @isgpu, @iscpu, substitute, substitute_in_kernel, in_signature, inexpr_walk, adjust_signatures, handle_indices_and_literals, add_inbounds, cast, @ranges, @rangelengths, @return_value, @return_nothing -import .ParallelKernel: PKG_CUDA, PKG_AMDGPU, PKG_THREADS, PKG_POLYESTER, PKG_NONE, NUMBERTYPE_NONE, SUPPORTED_NUMBERTYPES, SUPPORTED_PACKAGES, ERRMSG_UNSUPPORTED_PACKAGE, INT_CUDA, INT_AMDGPU, INT_POLYESTER, INT_THREADS, INDICES, PKNumber, RANGES_VARNAME, RANGES_TYPE, RANGELENGTH_XYZ_TYPE, RANGELENGTHS_VARNAMES, THREADIDS_VARNAMES, GENSYM_SEPARATOR, AD_SUPPORTED_ANNOTATIONS +import .ParallelKernel: PKG_CUDA, PKG_AMDGPU, PKG_METAL, PKG_THREADS, PKG_POLYESTER, PKG_NONE, NUMBERTYPE_NONE, SUPPORTED_NUMBERTYPES, SUPPORTED_PACKAGES, ERRMSG_UNSUPPORTED_PACKAGE, INT_CUDA, INT_AMDGPU, INT_METAL, INT_POLYESTER, INT_THREADS, INDICES, PKNumber, RANGES_VARNAME, RANGES_TYPE, RANGELENGTH_XYZ_TYPE, RANGELENGTHS_VARNAMES, THREADIDS_VARNAMES, GENSYM_SEPARATOR, AD_SUPPORTED_ANNOTATIONS import .ParallelKernel: @require, @symbols, symbols, longnameof, @prettyexpand, @prettystring, prettystring, @gorgeousexpand, @gorgeousstring, gorgeousstring @@ -25,6 +25,7 @@ const LOOPSIZE = 16 const LOOPDIM_NONE = 0 const NTHREADS_MAX_MEMOPT_CUDA = 128 const NTHREADS_MAX_MEMOPT_AMDGPU = 256 +const NTHREADS_MAX_MEMOPT_METAL = 256 const USE_SHMEMHALO_DEFAULT = true const USE_SHMEMHALO_1D_DEFAULT = true const USE_FULLRANGE_DEFAULT = (false, false, true) diff --git a/test/runtests.jl b/test/runtests.jl index 85ba20e5..987a96bc 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -2,9 +2,10 @@ push!(LOAD_PATH, "../src") import ParallelStencil # Precompile it. -import ParallelStencil: SUPPORTED_PACKAGES, PKG_CUDA, PKG_AMDGPU +import ParallelStencil: SUPPORTED_PACKAGES, PKG_CUDA, PKG_AMDGPU, PKG_METAL @static if (PKG_CUDA in SUPPORTED_PACKAGES) import CUDA end @static if (PKG_AMDGPU in SUPPORTED_PACKAGES) import AMDGPU end +@static if (PKG_METAL in SUPPORTED_PACKAGES) import Metal end excludedfiles = [ "test_excluded.jl", "test_incremental_compilation.jl"]; # TODO: test_incremental_compilation has to be deactivated until Polyester support released @@ -25,6 +26,10 @@ function runtests() @warn "Test Skip: All AMDGPU tests will be skipped because AMDGPU is not functional (if this is unexpected type `import AMDGPU; AMDGPU.functional()` to debug your AMDGPU installation)." end + if (PKG_METAL in SUPPORTED_PACKAGES && !Metal.functional()) + @warn "Test Skip: All Metal tests will be skipped because Metal is not functional (if this is unexpected type `import Metal; Metal.functional()` to debug your Metal installation)." 
+ end + for f in testfiles println("") if basename(f) ∈ excludedfiles diff --git a/test/test_FiniteDifferences1D.jl b/test/test_FiniteDifferences1D.jl index bd058592..59578674 100644 --- a/test/test_FiniteDifferences1D.jl +++ b/test/test_FiniteDifferences1D.jl @@ -12,6 +12,10 @@ end import AMDGPU if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end +@static if PKG_METAL in TEST_PACKAGES + import Metal + if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end +end Base.retry_load_extensions() # Potentially needed to load the extensions after the packages have been filtered. @static for package in TEST_PACKAGES eval(:( diff --git a/test/test_FiniteDifferences2D.jl b/test/test_FiniteDifferences2D.jl index e836f3a8..539cb365 100644 --- a/test/test_FiniteDifferences2D.jl +++ b/test/test_FiniteDifferences2D.jl @@ -12,6 +12,10 @@ end import AMDGPU if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end +@static if PKG_METAL in TEST_PACKAGES + import Metal + if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end +end Base.retry_load_extensions() # Potentially needed to load the extensions after the packages have been filtered. @static for package in TEST_PACKAGES eval(:( diff --git a/test/test_FiniteDifferences3D.jl b/test/test_FiniteDifferences3D.jl index 056ffae0..2c0154b6 100644 --- a/test/test_FiniteDifferences3D.jl +++ b/test/test_FiniteDifferences3D.jl @@ -12,6 +12,10 @@ end import AMDGPU if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end +@static if PKG_METAL in TEST_PACKAGES + import Metal + if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end +end Base.retry_load_extensions() # Potentially needed to load the extensions after the packages have been filtered. @static for package in TEST_PACKAGES eval(:( diff --git a/test/test_extensions.jl b/test/test_extensions.jl index b76d5962..cd929f4c 100644 --- a/test/test_extensions.jl +++ b/test/test_extensions.jl @@ -1,5 +1,5 @@ using Test -import ParallelStencil: SUPPORTED_PACKAGES, PKG_CUDA, PKG_AMDGPU, PKG_POLYESTER +import ParallelStencil: SUPPORTED_PACKAGES, PKG_CUDA, PKG_AMDGPU, PKG_METAL, PKG_POLYESTER TEST_PACKAGES = SUPPORTED_PACKAGES TEST_PACKAGES = filter!(x->x≠PKG_POLYESTER, TEST_PACKAGES) # NOTE: Polyester is not tested here, because the CPU case is sufficiently covered by the test of the Threads package. 
@static if PKG_CUDA in TEST_PACKAGES @@ -10,6 +10,10 @@ end import AMDGPU if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end +@static if PKG_METAL in TEST_PACKAGES + import Metal + if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end +end exename = joinpath(Sys.BINDIR, Base.julia_exename()) const TEST_PROJECTS = ["Diffusion3D_minimal"] # ["Diffusion3D_minimal", "Diffusion3D", "Diffusion"] diff --git a/test/test_incremental_compilation.jl b/test/test_incremental_compilation.jl index 5982dac8..7a02acea 100644 --- a/test/test_incremental_compilation.jl +++ b/test/test_incremental_compilation.jl @@ -9,6 +9,10 @@ end import AMDGPU if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end +@static if PKG_METAL in TEST_PACKAGES + import Metal + if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end +end @static if PKG_POLYESTER in TEST_PACKAGES import Polyester end diff --git a/test/test_init_parallel_stencil.jl b/test/test_init_parallel_stencil.jl index b77a8ff5..2370fd65 100644 --- a/test/test_init_parallel_stencil.jl +++ b/test/test_init_parallel_stencil.jl @@ -13,6 +13,10 @@ end import AMDGPU if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end +@static if PKG_METAL in TEST_PACKAGES + import Metal + if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end +end Base.retry_load_extensions() # Potentially needed to load the extensions after the packages have been filtered. @static for package in TEST_PACKAGES eval(:( diff --git a/test/test_parallel.jl b/test/test_parallel.jl index b4d6e2f7..59ae434d 100644 --- a/test/test_parallel.jl +++ b/test/test_parallel.jl @@ -15,6 +15,10 @@ end import AMDGPU if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end +@static if PKG_METAL in TEST_PACKAGES + import Metal + if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end +end Base.retry_load_extensions() # Potentially needed to load the extensions after the packages have been filtered. import ParallelStencil.@gorgeousexpand diff --git a/test/test_reset_parallel_stencil.jl b/test/test_reset_parallel_stencil.jl index 481e6b52..870f46c3 100644 --- a/test/test_reset_parallel_stencil.jl +++ b/test/test_reset_parallel_stencil.jl @@ -11,6 +11,10 @@ end import AMDGPU if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end +@static if PKG_METAL in TEST_PACKAGES + import Metal + if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end +end Base.retry_load_extensions() # Potentially needed to load the extensions after the packages have been filtered. 
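The guard added to each of the test files above reduces, when no Apple GPU is available, to a plain filter over the package list. A stand-alone sketch of its net effect (illustrative only; the real guard queries Metal.functional() at load time, and the package list shown assumes the extended SUPPORTED_PACKAGES from this series):

TEST_PACKAGES = [:Threads, :Polyester, :CUDA, :AMDGPU, :Metal]  # mirrors SUPPORTED_PACKAGES after this series
metal_is_functional = false                                     # what Metal.functional() returns without an Apple GPU
if !metal_is_functional
    filter!(x -> x != :Metal, TEST_PACKAGES)                    # Metal testsets are then skipped entirely
end
@assert TEST_PACKAGES == [:Threads, :Polyester, :CUDA, :AMDGPU]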
@static for package in TEST_PACKAGES eval(:( From 453e64ed3df1c874e57cfb010c76176c43319983 Mon Sep 17 00:00:00 2001 From: GiackAloZ Date: Wed, 9 Oct 2024 13:38:01 +0200 Subject: [PATCH 17/46] WIP tests and fix compatibility issue (bump Metal to v1.0) --- Project.toml | 3 +- src/FiniteDifferences.jl | 78 +++++----- src/ParallelKernel/Data.jl | 18 ++- src/ParallelKernel/MetalExt/allocators.jl | 20 +-- src/ParallelKernel/MetalExt/shared.jl | 6 +- src/ParallelKernel/parallel.jl | 2 +- src/ParallelKernel/shared.jl | 2 +- src/kernel_language.jl | 4 +- test/ParallelKernel/test_allocators.jl | 83 +++++++++- .../ParallelKernel/test_hide_communication.jl | 29 ++-- .../test_init_parallel_kernel.jl | 9 +- test/ParallelKernel/test_kernel_language.jl | 32 +++- test/ParallelKernel/test_parallel.jl | 75 +++++---- .../test_reset_parallel_kernel.jl | 9 +- test/test_FiniteDifferences1D.jl | 42 ++--- test/test_FiniteDifferences2D.jl | 73 ++++----- test/test_FiniteDifferences3D.jl | 120 ++++++++------- test/test_extensions.jl | 3 + test/test_incremental_compilation.jl | 2 +- test/test_init_parallel_stencil.jl | 5 +- test/test_parallel.jl | 143 +++++++++++------- .../test/localtest_diffusion_Metal.jl | 8 + test/test_reset_parallel_stencil.jl | 5 +- 23 files changed, 499 insertions(+), 272 deletions(-) create mode 100644 test/test_projects/Diffusion3D_minimal/test/localtest_diffusion_Metal.jl diff --git a/Project.toml b/Project.toml index 38bfef7f..e7036d24 100644 --- a/Project.toml +++ b/Project.toml @@ -28,6 +28,7 @@ CUDA = "3.12, 4, 5" CellArrays = "0.2.1" Enzyme = "0.11" MacroTools = "0.5" +Metal = "1.0" Polyester = "0.7" StaticArrays = "1" julia = "1.9" # Minimum version supporting extensions @@ -37,4 +38,4 @@ TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Test", "TOML", "AMDGPU", "CUDA", "Enzyme", "Polyester"] +test = ["Test", "TOML", "AMDGPU", "CUDA", "Metal", "Enzyme", "Polyester"] diff --git a/src/FiniteDifferences.jl b/src/FiniteDifferences.jl index a5266c98..584e92dd 100644 --- a/src/FiniteDifferences.jl +++ b/src/FiniteDifferences.jl @@ -54,8 +54,8 @@ macro d(A) @expandargs(A); esc(:( $A[$ix+1] - $A[$ix] )) end macro d2(A) @expandargs(A); esc(:( ($A[$ixi+1] - $A[$ixi]) - ($A[$ixi] - $A[$ixi-1]) )) end macro all(A) @expandargs(A); esc(:( $A[$ix ] )) end macro inn(A) @expandargs(A); esc(:( $A[$ixi ] )) end -macro av(A) @expandargs(A); esc(:(($A[$ix] + $A[$ix+1] )*0.5 )) end -macro harm(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix] + 1.0/$A[$ix+1])*2.0 )) end +macro av(A) @expandargs(A); esc(:(($A[$ix] + $A[$ix+1] )/2 )) end +macro harm(A) @expandargs(A); esc(:(1/(1/$A[$ix] + 1/$A[$ix+1])*2 )) end macro maxloc(A) @expandargs(A); esc(:( max( max($A[$ixi-1], $A[$ixi+1]), $A[$ixi] ) )) end macro minloc(A) @expandargs(A); esc(:( min( min($A[$ixi-1], $A[$ixi+1]), $A[$ixi] ) )) end @@ -168,15 +168,15 @@ macro inn(A) @expandargs(A); esc(:( $A[$ixi ,$iyi ] )) end macro inn_x(A) @expandargs(A); esc(:( $A[$ixi ,$iy ] )) end macro inn_y(A) @expandargs(A); esc(:( $A[$ix ,$iyi ] )) end macro av(A) @expandargs(A); esc(:(($A[$ix ,$iy ] + $A[$ix+1,$iy ] + $A[$ix,$iy+1] + $A[$ix+1,$iy+1])*0.25 )) end -macro av_xa(A) @expandargs(A); esc(:(($A[$ix ,$iy ] + $A[$ix+1,$iy ] )*0.5 )) end -macro av_ya(A) @expandargs(A); esc(:(($A[$ix ,$iy ] + $A[$ix ,$iy+1] )*0.5 )) end -macro av_xi(A) @expandargs(A); esc(:(($A[$ix ,$iyi ] + $A[$ix+1,$iyi ] )*0.5 )) end -macro av_yi(A) @expandargs(A); esc(:(($A[$ixi ,$iy ] + $A[$ixi ,$iy+1] )*0.5 )) end -macro harm(A) @expandargs(A); 
esc(:(1.0/(1.0/$A[$ix ,$iy ] + 1.0/$A[$ix+1,$iy ] + 1.0/$A[$ix,$iy+1] + 1.0/$A[$ix+1,$iy+1])*4.0 )) end -macro harm_xa(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix ,$iy ] + 1.0/$A[$ix+1,$iy ] )*2.0 )) end -macro harm_ya(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix ,$iy ] + 1.0/$A[$ix ,$iy+1] )*2.0 )) end -macro harm_xi(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix ,$iyi ] + 1.0/$A[$ix+1,$iyi ] )*2.0 )) end -macro harm_yi(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ixi ,$iy ] + 1.0/$A[$ixi ,$iy+1] )*2.0 )) end +macro av_xa(A) @expandargs(A); esc(:(($A[$ix ,$iy ] + $A[$ix+1,$iy ] )/2 )) end +macro av_ya(A) @expandargs(A); esc(:(($A[$ix ,$iy ] + $A[$ix ,$iy+1] )/2 )) end +macro av_xi(A) @expandargs(A); esc(:(($A[$ix ,$iyi ] + $A[$ix+1,$iyi ] )/2 )) end +macro av_yi(A) @expandargs(A); esc(:(($A[$ixi ,$iy ] + $A[$ixi ,$iy+1] )/2 )) end +macro harm(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iy ] + 1/$A[$ix+1,$iy ] + 1/$A[$ix,$iy+1] + 1/$A[$ix+1,$iy+1])*4.0 )) end +macro harm_xa(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iy ] + 1/$A[$ix+1,$iy ] )*2 )) end +macro harm_ya(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iy ] + 1/$A[$ix ,$iy+1] )*2 )) end +macro harm_xi(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iyi ] + 1/$A[$ix+1,$iyi ] )*2 )) end +macro harm_yi(A) @expandargs(A); esc(:(1/(1/$A[$ixi ,$iy ] + 1/$A[$ixi ,$iy+1] )*2 )) end macro maxloc(A) @expandargs(A); esc(:( max( max( max($A[$ixi-1,$iyi ], $A[$ixi+1,$iyi ]) , $A[$ixi ,$iyi ] ), max($A[$ixi ,$iyi-1], $A[$ixi ,$iyi+1]) ) )) end macro minloc(A) @expandargs(A); esc(:( min( min( min($A[$ixi-1,$iyi ], $A[$ixi+1,$iyi ]) , $A[$ixi ,$iyi ] ), @@ -343,12 +343,12 @@ macro av(A) @expandargs(A); esc(:(($A[$ix ,$iy ,$iz ] + $A[$ix+1,$iy $A[$ix+1,$iy+1,$iz ] + $A[$ix+1,$iy+1,$iz+1] + $A[$ix ,$iy+1,$iz+1] + $A[$ix ,$iy ,$iz+1] + $A[$ix+1,$iy ,$iz+1] + $A[$ix ,$iy+1,$iz ] )*0.125)) end -macro av_xa(A) @expandargs(A); esc(:(($A[$ix ,$iy ,$iz ] + $A[$ix+1,$iy ,$iz ] )*0.5 )) end -macro av_ya(A) @expandargs(A); esc(:(($A[$ix ,$iy ,$iz ] + $A[$ix ,$iy+1,$iz ] )*0.5 )) end -macro av_za(A) @expandargs(A); esc(:(($A[$ix ,$iy ,$iz ] + $A[$ix ,$iy ,$iz+1] )*0.5 )) end -macro av_xi(A) @expandargs(A); esc(:(($A[$ix ,$iyi ,$izi ] + $A[$ix+1,$iyi ,$izi ] )*0.5 )) end -macro av_yi(A) @expandargs(A); esc(:(($A[$ixi ,$iy ,$izi ] + $A[$ixi ,$iy+1,$izi ] )*0.5 )) end -macro av_zi(A) @expandargs(A); esc(:(($A[$ixi ,$iyi ,$iz ] + $A[$ixi ,$iyi ,$iz+1] )*0.5 )) end +macro av_xa(A) @expandargs(A); esc(:(($A[$ix ,$iy ,$iz ] + $A[$ix+1,$iy ,$iz ] )/2 )) end +macro av_ya(A) @expandargs(A); esc(:(($A[$ix ,$iy ,$iz ] + $A[$ix ,$iy+1,$iz ] )/2 )) end +macro av_za(A) @expandargs(A); esc(:(($A[$ix ,$iy ,$iz ] + $A[$ix ,$iy ,$iz+1] )/2 )) end +macro av_xi(A) @expandargs(A); esc(:(($A[$ix ,$iyi ,$izi ] + $A[$ix+1,$iyi ,$izi ] )/2 )) end +macro av_yi(A) @expandargs(A); esc(:(($A[$ixi ,$iy ,$izi ] + $A[$ixi ,$iy+1,$izi ] )/2 )) end +macro av_zi(A) @expandargs(A); esc(:(($A[$ixi ,$iyi ,$iz ] + $A[$ixi ,$iyi ,$iz+1] )/2 )) end macro av_xya(A) @expandargs(A); esc(:(($A[$ix ,$iy ,$iz ] + $A[$ix+1,$iy ,$iz ] + $A[$ix ,$iy+1,$iz ] + $A[$ix+1,$iy+1,$iz ] )*0.25 )) end macro av_xza(A) @expandargs(A); esc(:(($A[$ix ,$iy ,$iz ] + $A[$ix+1,$iy ,$iz ] + @@ -361,28 +361,28 @@ macro av_xzi(A) @expandargs(A); esc(:(($A[$ix ,$iyi ,$iz ] + $A[$ix+1,$iyi $A[$ix ,$iyi ,$iz+1] + $A[$ix+1,$iyi ,$iz+1] )*0.25 )) end macro av_yzi(A) @expandargs(A); esc(:(($A[$ixi ,$iy ,$iz ] + $A[$ixi ,$iy+1,$iz ] + $A[$ixi ,$iy ,$iz+1] + $A[$ixi ,$iy+1,$iz+1] )*0.25 )) end -macro harm(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix ,$iy ,$iz ] + 1.0/$A[$ix+1,$iy ,$iz 
] + - 1.0/$A[$ix+1,$iy+1,$iz ] + 1.0/$A[$ix+1,$iy+1,$iz+1] + - 1.0/$A[$ix ,$iy+1,$iz+1] + 1.0/$A[$ix ,$iy ,$iz+1] + - 1.0/$A[$ix+1,$iy ,$iz+1] + 1.0/$A[$ix ,$iy+1,$iz ] )*8.0)) end -macro harm_xa(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix ,$iy ,$iz ] + 1.0/$A[$ix+1,$iy ,$iz ] )*2.0 )) end -macro harm_ya(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix ,$iy ,$iz ] + 1.0/$A[$ix ,$iy+1,$iz ] )*2.0 )) end -macro harm_za(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix ,$iy ,$iz ] + 1.0/$A[$ix ,$iy ,$iz+1] )*2.0 )) end -macro harm_xi(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix ,$iyi ,$izi ] + 1.0/$A[$ix+1,$iyi ,$izi ] )*2.0 )) end -macro harm_yi(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ixi ,$iy ,$izi ] + 1.0/$A[$ixi ,$iy+1,$izi ] )*2.0 )) end -macro harm_zi(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ixi ,$iyi ,$iz ] + 1.0/$A[$ixi ,$iyi ,$iz+1] )*2.0 )) end -macro harm_xya(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix ,$iy ,$iz ] + 1.0/$A[$ix+1,$iy ,$iz ] + - 1.0/$A[$ix ,$iy+1,$iz ] + 1.0/$A[$ix+1,$iy+1,$iz ] )*4.0 )) end -macro harm_xza(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix ,$iy ,$iz ] + 1.0/$A[$ix+1,$iy ,$iz ] + - 1.0/$A[$ix ,$iy ,$iz+1] + 1.0/$A[$ix+1,$iy ,$iz+1] )*4.0 )) end -macro harm_yza(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix ,$iy ,$iz ] + 1.0/$A[$ix ,$iy+1,$iz ] + - 1.0/$A[$ix ,$iy ,$iz+1] + 1.0/$A[$ix ,$iy+1,$iz+1] )*4.0 )) end -macro harm_xyi(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix ,$iy ,$izi ] + 1.0/$A[$ix+1,$iy ,$izi ] + - 1.0/$A[$ix ,$iy+1,$izi ] + 1.0/$A[$ix+1,$iy+1,$izi ] )*4.0 )) end -macro harm_xzi(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix ,$iyi ,$iz ] + 1.0/$A[$ix+1,$iyi ,$iz ] + - 1.0/$A[$ix ,$iyi ,$iz+1] + 1.0/$A[$ix+1,$iyi ,$iz+1] )*4.0 )) end -macro harm_yzi(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ixi ,$iy ,$iz ] + 1.0/$A[$ixi ,$iy+1,$iz ] + - 1.0/$A[$ixi ,$iy ,$iz+1] + 1.0/$A[$ixi ,$iy+1,$iz+1] )*4.0 )) end +macro harm(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iy ,$iz ] + 1/$A[$ix+1,$iy ,$iz ] + + 1/$A[$ix+1,$iy+1,$iz ] + 1/$A[$ix+1,$iy+1,$iz+1] + + 1/$A[$ix ,$iy+1,$iz+1] + 1/$A[$ix ,$iy ,$iz+1] + + 1/$A[$ix+1,$iy ,$iz+1] + 1/$A[$ix ,$iy+1,$iz ] )*8.0)) end +macro harm_xa(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iy ,$iz ] + 1/$A[$ix+1,$iy ,$iz ] )*2 )) end +macro harm_ya(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iy ,$iz ] + 1/$A[$ix ,$iy+1,$iz ] )*2 )) end +macro harm_za(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iy ,$iz ] + 1/$A[$ix ,$iy ,$iz+1] )*2 )) end +macro harm_xi(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iyi ,$izi ] + 1/$A[$ix+1,$iyi ,$izi ] )*2 )) end +macro harm_yi(A) @expandargs(A); esc(:(1/(1/$A[$ixi ,$iy ,$izi ] + 1/$A[$ixi ,$iy+1,$izi ] )*2 )) end +macro harm_zi(A) @expandargs(A); esc(:(1/(1/$A[$ixi ,$iyi ,$iz ] + 1/$A[$ixi ,$iyi ,$iz+1] )*2 )) end +macro harm_xya(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iy ,$iz ] + 1/$A[$ix+1,$iy ,$iz ] + + 1/$A[$ix ,$iy+1,$iz ] + 1/$A[$ix+1,$iy+1,$iz ] )*4.0 )) end +macro harm_xza(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iy ,$iz ] + 1/$A[$ix+1,$iy ,$iz ] + + 1/$A[$ix ,$iy ,$iz+1] + 1/$A[$ix+1,$iy ,$iz+1] )*4.0 )) end +macro harm_yza(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iy ,$iz ] + 1/$A[$ix ,$iy+1,$iz ] + + 1/$A[$ix ,$iy ,$iz+1] + 1/$A[$ix ,$iy+1,$iz+1] )*4.0 )) end +macro harm_xyi(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iy ,$izi ] + 1/$A[$ix+1,$iy ,$izi ] + + 1/$A[$ix ,$iy+1,$izi ] + 1/$A[$ix+1,$iy+1,$izi ] )*4.0 )) end +macro harm_xzi(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iyi ,$iz ] + 1/$A[$ix+1,$iyi ,$iz ] + + 1/$A[$ix ,$iyi ,$iz+1] + 1/$A[$ix+1,$iyi ,$iz+1] )*4.0 )) end +macro harm_yzi(A) @expandargs(A); esc(:(1/(1/$A[$ixi ,$iy ,$iz ] + 1/$A[$ixi ,$iy+1,$iz ] + + 
1/$A[$ixi ,$iy ,$iz+1] + 1/$A[$ixi ,$iy+1,$iz+1] )*4.0 )) end macro maxloc(A) @expandargs(A); esc(:( max( max( max( max($A[$ixi-1,$iyi ,$izi ], $A[$ixi+1,$iyi ,$izi ]) , $A[$ixi ,$iyi ,$izi ] ), max($A[$ixi ,$iyi-1,$izi ], $A[$ixi ,$iyi+1,$izi ]) ), max($A[$ixi ,$iyi ,$izi-1], $A[$ixi ,$iyi ,$izi+1]) ) )) end diff --git a/src/ParallelKernel/Data.jl b/src/ParallelKernel/Data.jl index 2e6b47fe..16a72d50 100644 --- a/src/ParallelKernel/Data.jl +++ b/src/ParallelKernel/Data.jl @@ -231,32 +231,38 @@ function Data_metal(modulename::Symbol, numbertype::DataType, indextype::DataTyp Data_module = if (numbertype == NUMBERTYPE_NONE) :(baremodule $modulename # NOTE: there cannot be any newline before 'module Data' or it will create a begin end block and the module creation will fail. import Base, Metal, ParallelStencil.ParallelKernel.CellArrays, ParallelStencil.ParallelKernel.StaticArrays + # TODO: the constructors defined by CellArrays.@define_ROCCellArray lead to pre-compilation issues due to a bug in Julia. We therefore only create the type alias here for now. + const MetalCellArray{T,N,B,T_elem} = CellArrays.CellArray{T,N,B,Metal.MtlArray{T_elem,CellArrays._N}} + # const Index = $indextype const Array{T, N} = Metal.MtlArray{T, N} const DeviceArray{T, N} = Metal.MtlDeviceArray{T, N} const Cell{T, S} = Union{StaticArrays.SArray{S, T}, StaticArrays.FieldArray{S, T}} const DeviceCell{T, S} = Union{StaticArrays.SArray{S, T}, StaticArrays.FieldArray{S, T}} - # const CellArray{T_elem, N, B} = CellArrays.MTLCellArray{<:Cell{T_elem},N,B,T_elem} - # const DeviceCellArray{T_elem, N, B} = CellArrays.MTLCellArray{<:DeviceCell{T_elem},N,B,<:Metal.MtlDeviceArray{T_elem,CellArrays._N}} + const CellArray{T_elem, N, B} = MetalCellArray{<:Cell{T_elem},N,B,T_elem} + const DeviceCellArray{T_elem, N, B} = CellArrays.CellArray{<:DeviceCell{T_elem},N,B,<:Metal.MtlDeviceArray{T_elem,CellArrays._N}} $(create_shared_exprs(numbertype, indextype)) end) else :(baremodule $modulename # NOTE: there cannot be any newline before 'module Data' or it will create a begin end block and the module creation will fail. import Base, Metal, ParallelStencil.ParallelKernel.CellArrays, ParallelStencil.ParallelKernel.StaticArrays + # TODO: the constructors defined by CellArrays.@define_ROCCellArray lead to pre-compilation issues due to a bug in Julia. We therefore only create the type alias here for now. 
+ const MetalCellArray{T,N,B,T_elem} = CellArrays.CellArray{T,N,B,Metal.MtlArray{T_elem,CellArrays._N}} + # const Index = $indextype const Number = $numbertype const Array{N} = Metal.MtlArray{$numbertype, N} const DeviceArray{N} = Metal.MtlDeviceArray{$numbertype, N} const Cell{S} = Union{StaticArrays.SArray{S, $numbertype}, StaticArrays.FieldArray{S, $numbertype}} const DeviceCell{S} = Union{StaticArrays.SArray{S, $numbertype}, StaticArrays.FieldArray{S, $numbertype}} - # const CellArray{N, B} = CellArrays.MTLCellArray{<:Cell,N,B,$numbertype} - # const DeviceCellArray{N, B} = CellArrays.MTLCellArray{<:DeviceCell,N,B,<:Metal.MtlDeviceArray{$numbertype,CellArrays._N}} + const CellArray{N, B} = MetalCellArray{<:Cell,N,B,$numbertype} + const DeviceCellArray{N, B} = CellArrays.CellArray{<:DeviceCell,N,B,<:Metal.MtlDeviceArray{$numbertype,CellArrays._N}} const TArray{T, N} = Metal.MtlArray{T, N} const DeviceTArray{T, N} = Metal.MtlDeviceArray{T, N} const TCell{T, S} = Union{StaticArrays.SArray{S, T}, StaticArrays.FieldArray{S, T}} const DeviceTCell{T, S} = Union{StaticArrays.SArray{S, T}, StaticArrays.FieldArray{S, T}} - # const TCellArray{T_elem, N, B} = CellArrays.MTLCellArray{<:TCell{T_elem},N,B,T_elem} - # const DeviceTCellArray{T_elem, N, B} = CellArrays.MTLCellArray{<:DeviceTCell{T_elem},N,B,<:Metal.MtlDeviceArray{T_elem,CellArrays._N}} + const TCellArray{T_elem, N, B} = MetalCellArray{<:TCell{T_elem},N,B,T_elem} + const DeviceTCellArray{T_elem, N, B} = CellArrays.CellArray{<:DeviceTCell{T_elem},N,B,<:Metal.MtlDeviceArray{T_elem,CellArrays._N}} $(create_shared_exprs(numbertype, indextype)) end) end diff --git a/src/ParallelKernel/MetalExt/allocators.jl b/src/ParallelKernel/MetalExt/allocators.jl index 1d6697ef..e207d9d2 100644 --- a/src/ParallelKernel/MetalExt/allocators.jl +++ b/src/ParallelKernel/MetalExt/allocators.jl @@ -14,16 +14,16 @@ ParallelStencil.ParallelKernel.rand_metal(::Type{T}, blocklength, dims...) where ParallelStencil.ParallelKernel.falses_metal(::Type{T}, blocklength, args...) where {T<:Union{SArray,FieldArray}} = fill_metal(T, blocklength, false, args...) ParallelStencil.ParallelKernel.trues_metal(::Type{T}, blocklength, args...) where {T<:Union{SArray,FieldArray}} = fill_metal(T, blocklength, true, args...) -# function ParallelStencil.ParallelKernel.fill_metal(::Type{T}, ::Val{B}, x, args...) where {T <: Union{SArray,FieldArray}, B} -# if (!(eltype(x) <: Number) || (eltype(x) == Bool)) && (eltype(x) != eltype(T)) @ArgumentError("fill: the (element) type of argument 'x' is not a normal number type ($(eltype(x))), but does not match the obtained (default) 'eltype' ($(eltype(T))); automatic conversion to $(eltype(T)) is therefore not attempted. Set the keyword argument 'eltype' accordingly to the element type of 'x' or pass an 'x' of a different (element) type.") end -# check_datatype_metal(T, Bool, Enum) -# if (length(x) == 1) cell = convert(T, fill(convert(eltype(T), x), size(T))) -# elseif (length(x) == length(T)) cell = convert(T, x) -# else @ArgumentError("fill: argument 'x' contains the wrong number of elements ($(length(x))). It must be a scalar or contain the number of elements defined by 'celldims'.") -# end -# return CellArrays.fill!(MtlCellArray{T,B}(undef, args...), cell) -# end +function ParallelStencil.ParallelKernel.fill_metal(::Type{T}, ::Val{B}, x, args...) 
where {T <: Union{SArray,FieldArray}, B} + if (!(eltype(x) <: Number) || (eltype(x) == Bool)) && (eltype(x) != eltype(T)) @ArgumentError("fill: the (element) type of argument 'x' is not a normal number type ($(eltype(x))), but does not match the obtained (default) 'eltype' ($(eltype(T))); automatic conversion to $(eltype(T)) is therefore not attempted. Set the keyword argument 'eltype' accordingly to the element type of 'x' or pass an 'x' of a different (element) type.") end + check_datatype_metal(T, Bool, Enum) + if (length(x) == 1) cell = convert(T, fill(convert(eltype(T), x), size(T))) + elseif (length(x) == length(T)) cell = convert(T, x) + else @ArgumentError("fill: argument 'x' contains the wrong number of elements ($(length(x))). It must be a scalar or contain the number of elements defined by 'celldims'.") + end + return CellArrays.fill!(MtlCellArray{T,B}(undef, args...), cell) +end -# ParallelStencil.ParallelKernel.fill_metal!(A, x) = Metal.fill!(A, construct_cell(A, x)) +ParallelStencil.ParallelKernel.fill_metal!(A, x) = Metal.fill!(A, construct_cell(A, x)) check_datatype_metal(args...) = check_datatype(args..., INT_METAL) \ No newline at end of file diff --git a/src/ParallelKernel/MetalExt/shared.jl b/src/ParallelKernel/MetalExt/shared.jl index 686a51aa..ffcb011f 100644 --- a/src/ParallelKernel/MetalExt/shared.jl +++ b/src/ParallelKernel/MetalExt/shared.jl @@ -11,7 +11,7 @@ import Metal.MTL ParallelStencil.ParallelKernel.is_loaded(::Val{:ParallelStencil_MetalExt}) = true ## FUNCTIONS TO GET CREATE AND MANAGE METAL QUEUES -ParallelStencil.ParallelKernel.get_priority_stream(arg...) = get_priority_metalstream(arg...) +ParallelStencil.ParallelKernel.get_priority_metalstream(arg...) = get_priority_metalstream(arg...) ParallelStencil.ParallelKernel.get_metalstream(arg...) = get_metalstream(arg...) let @@ -20,12 +20,12 @@ let metalqueues = Array{MTL.MTLCommandQueue}(undef, 0) function get_priority_metalstream(id::Integer) - while (id > length(priority_metalqueues)) push!(priority_metalqueues, MTL.MTLCommandQueue(device())) end # No priority setting available in Metal queues. + while (id > length(priority_metalqueues)) push!(priority_metalqueues, MTL.MTLCommandQueue(Metal.current_device())) end # No priority setting available in Metal queues. return priority_metalqueues[id] end function get_metalstream(id::Integer) - while (id > length(metalqueues)) push!(metalqueues, MTL.MTLCommandQueue(device())) end + while (id > length(metalqueues)) push!(metalqueues, MTL.MTLCommandQueue(Metal.current_device())) end return metalqueues[id] end end \ No newline at end of file diff --git a/src/ParallelKernel/parallel.jl b/src/ParallelKernel/parallel.jl index 0e24b96b..334003c9 100644 --- a/src/ParallelKernel/parallel.jl +++ b/src/ParallelKernel/parallel.jl @@ -598,7 +598,7 @@ end function default_stream(package) if (package == PKG_CUDA) return :(CUDA.stream()) # Use the default stream of the task. elseif (package == PKG_AMDGPU) return :(AMDGPU.stream()) # Use the default stream of the task. - elseif (package == PKG_METAL) return :(Metal.global_queue(device())) # Use the default queue of the task. + elseif (package == PKG_METAL) return :(Metal.global_queue(Metal.current_device())) # Use the default queue of the task. 
else @ModuleInternalError("unsupported GPU package (obtained: $package).") end end \ No newline at end of file diff --git a/src/ParallelKernel/shared.jl b/src/ParallelKernel/shared.jl index a2deb486..a22520a6 100644 --- a/src/ParallelKernel/shared.jl +++ b/src/ParallelKernel/shared.jl @@ -16,7 +16,7 @@ const PKG_METAL = :Metal const PKG_THREADS = :Threads const PKG_POLYESTER = :Polyester const PKG_NONE = :PKG_NONE -const SUPPORTED_PACKAGES = [PKG_THREADS, PKG_POLYESTER, PKG_CUDA, PKG_AMDGPU] +const SUPPORTED_PACKAGES = [PKG_THREADS, PKG_POLYESTER, PKG_CUDA, PKG_AMDGPU, PKG_METAL] const INT_CUDA = Int64 # NOTE: unsigned integers are not yet supported (proper negative offset and range is dealing missing) const INT_AMDGPU = Int64 # NOTE: ... const INT_METAL = Int64 # NOTE: ... diff --git a/src/kernel_language.jl b/src/kernel_language.jl index 6c7e4dd2..cfc5c819 100644 --- a/src/kernel_language.jl +++ b/src/kernel_language.jl @@ -150,12 +150,12 @@ $((:( $A_head = @sharedMem(eltype($A), (Int64($nx_l), Int64 for (A, s) in shmem_vars for (shmem_offset, nx_l, ny_l, A_head) = ((shmem_exprs[A][:offset], s[:nx_l], s[:ny_l], s[:A_head]),) )... ) -$((:( $reg = 0.0 # e.g. A_ixm1_iyp2_izp2 = 0.0 +$((:( $reg = 0 # e.g. A_ixm1_iyp2_izp2 = 0 ) for A in optvars for regs in values(regqueue_tails[A]) for reg in values(regs) )... ) -$((:( $reg = 0.0 # e.g. A_ixm1_iyp2_izp3 = 0.0 +$((:( $reg = 0 # e.g. A_ixm1_iyp2_izp3 = 0 ) for A in optvars for regs in values(regqueue_heads[A]) for reg in values(regs) )... diff --git a/test/ParallelKernel/test_allocators.jl b/test/ParallelKernel/test_allocators.jl index 6ae628c1..767d1333 100644 --- a/test/ParallelKernel/test_allocators.jl +++ b/test/ParallelKernel/test_allocators.jl @@ -2,7 +2,7 @@ using Test using CellArrays, StaticArrays import ParallelStencil using ParallelStencil.ParallelKernel -import ParallelStencil.ParallelKernel: @reset_parallel_kernel, @is_initialized, @get_numbertype, NUMBERTYPE_NONE, SUPPORTED_PACKAGES, PKG_CUDA, PKG_AMDGPU +import ParallelStencil.ParallelKernel: @reset_parallel_kernel, @is_initialized, @get_numbertype, NUMBERTYPE_NONE, SUPPORTED_PACKAGES, PKG_CUDA, PKG_AMDGPU, PKG_METAL, PKG_POLYESTER import ParallelStencil.ParallelKernel: @require, @prettystring, @gorgeousstring import ParallelStencil.ParallelKernel: checkargs_CellType, _CellType using ParallelStencil.ParallelKernel.Exceptions @@ -17,6 +17,14 @@ end if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end @define_ROCCellArray end +@static if PKG_METAL in TEST_PACKAGES + import Metal + if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end + #@define_MetalCellArray +end +@static if PKG_POLYESTER in TEST_PACKAGES + import Polyester +end Base.retry_load_extensions() # Potentially needed to load the extensions after the packages have been filtered. const DATA_INDEX = ParallelStencil.INT_THREADS # TODO: using Data.Index does not work in combination with @reset_parallel_kernel, because the macros from module Test alternate the order of evaluation, resulting in the Data module being replaced with an empty module before Data.Index is evaluated. If at some point the indexing varies depending on the used package, then something more sophisticated is needed here (e.g., wrapping the test for each package in a module and using then Data.Index everywhere). 
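A minimal usage sketch of the allocator path that the next hunks test (assumes an Apple GPU with a functional Metal.jl install; Float32 is used because Metal offers no Float64 support, and the macro calls mirror those in the testsets below):

import Metal                      # must come before ParallelStencil so the Metal extension is loaded
using ParallelStencil
using ParallelStencil.ParallelKernel
@init_parallel_kernel(Metal, Float32)

A = @zeros(2, 3)                  # Metal.MtlArray{Float32, 2} filled with zeros
B = @fill(9, 2, 3)                # Metal.MtlArray{Float32, 2} filled with 9.0f0
@fill!(A, 3)                      # in-place fill, dispatching to fill_metal!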
@@ -129,6 +137,17 @@ const DATA_INDEX = ParallelStencil.INT_THREADS # TODO: using Data.Index does not @test typeof(@fill(9, 2,3)) == typeof(AMDGPU.ROCArray(fill(convert(Float16, 9), 2,3))) @test typeof(@fill(9, 2,3, eltype=Float64)) == typeof(AMDGPU.ROCArray(fill(convert(Float64, 9), 2,3))) @test typeof(@fill(9, 2,3, eltype=DATA_INDEX)) == typeof(AMDGPU.ROCArray(fill(convert(DATA_INDEX, 9), 2,3))) + elseif $package == $PKG_METAL + @test typeof(@zeros(2,3)) == typeof(Metal.MtlArray(zeros(Float16,2,3))) + @test typeof(@zeros(2,3, eltype=Float32)) == typeof(Metal.MtlArray(zeros(Float32,2,3))) + @test typeof(@zeros(2,3, eltype=DATA_INDEX)) == typeof(Metal.MtlArray(zeros(DATA_INDEX,2,3))) + @test typeof(@ones(2,3)) == typeof(Metal.MtlArray(ones(Float16,2,3))) + @test typeof(@ones(2,3, eltype=Float32)) == typeof(Metal.MtlArray(ones(Float32,2,3))) + @test typeof(@ones(2,3, eltype=DATA_INDEX)) == typeof(Metal.MtlArray(ones(DATA_INDEX,2,3))) + @test typeof(@rand(2,3)) == typeof(Metal.MtlArray(rand(Float16,2,3))) + @test typeof(@rand(2,3, eltype=DATA_INDEX)) == typeof(Metal.MtlArray(rand(DATA_INDEX,2,3))) + @test typeof(@fill(9, 2,3)) == typeof(Metal.MtlArray(fill(convert(Float16, 9), 2,3))) + @test typeof(@fill(9, 2,3, eltype=DATA_INDEX)) == typeof(Metal.MtlArray(fill(convert(DATA_INDEX, 9), 2,3))) else @test typeof(@zeros(2,3)) == typeof(parentmodule($package).zeros(Float16,2,3)) @test typeof(@zeros(2,3, eltype=Float32)) == typeof(parentmodule($package).zeros(Float32,2,3)) @@ -180,6 +199,16 @@ const DATA_INDEX = ParallelStencil.INT_THREADS # TODO: using Data.Index does not @test @trues(2,3, celldims=(3,4)) == CellArrays.fill!(ROCCellArray{T_Bool}(undef,2,3), trues((3,4))) @test @zeros(2,3, celldims=(3,4), eltype=DATA_INDEX) == CellArrays.fill!(ROCCellArray{T_Index}(undef,2,3), T_Index(zeros((3,4)))) AMDGPU.allowscalar(false) #TODO: check how to do + elseif $package == $PKG_METAL + # @test @zeros(2,3, celldims=(3,4)) == CellArrays.fill!(MtlCellArray{T_Float16}(undef,2,3), T_Float16(zeros((3,4)))) + # @test @zeros(2,3, celldims=(3,4), eltype=Float32) == CellArrays.fill!(MtlCellArray{T_Float32}(undef,2,3), T_Float32(zeros((3,4)))) + # @test @ones(2,3, celldims=(3,4)) == CellArrays.fill!(MtlCellArray{T_Float16}(undef,2,3), T_Float16(ones((3,4)))) + # @test @ones(2,3, celldims=(3,4), eltype=Float32) == CellArrays.fill!(MtlCellArray{T_Float32}(undef,2,3), T_Float32(ones((3,4)))) + # @test typeof(@rand(2,3, celldims=(3,4))) == typeof(MtlCellArray{T_Float16,0}(undef,2,3)) + # @test typeof(@fill(9, 2,3, celldims=(3,4))) == typeof(MtlCellArray{T_Float16,0}(undef,2,3)) + # @test @falses(2,3, celldims=(3,4)) == CellArrays.fill!(MtlCellArray{T_Bool}(undef,2,3), falses((3,4))) + # @test @trues(2,3, celldims=(3,4)) == CellArrays.fill!(MtlCellArray{T_Bool}(undef,2,3), trues((3,4))) + # @test @zeros(2,3, celldims=(3,4), eltype=DATA_INDEX) == CellArrays.fill!(MtlCellArray{T_Index}(undef,2,3), T_Index(zeros((3,4)))) else @test @zeros(2,3, celldims=(3,4)) == CellArrays.fill!(CPUCellArray{T_Float16}(undef,2,3), T_Float16(zeros((3,4)))) @test @zeros(2,3, celldims=(3,4), eltype=Float32) == CellArrays.fill!(CPUCellArray{T_Float32}(undef,2,3), T_Float32(zeros((3,4)))) @@ -219,6 +248,15 @@ const DATA_INDEX = ParallelStencil.INT_THREADS # TODO: using Data.Index does not @test typeof(@fill(9, 2,3, celltype=SymmetricTensor2D)) == typeof(ROCCellArray{SymmetricTensor2D,0}(undef,2,3)) @test @zeros(2,3, celltype=SymmetricTensor2D_Index) == CellArrays.fill!(ROCCellArray{SymmetricTensor2D_Index}(undef,2,3), 
SymmetricTensor2D_Index(zeros(3))) AMDGPU.allowscalar(false) + elseif $package == $PKG_METAL + # @test @zeros(2,3, celltype=SymmetricTensor2D) == CellArrays.fill!(MtlCellArray{SymmetricTensor2D}(undef,2,3), SymmetricTensor2D(zeros(3))) + # @test @zeros(2,3, celltype=SymmetricTensor3D) == CellArrays.fill!(MtlCellArray{SymmetricTensor3D}(undef,2,3), SymmetricTensor3D(zeros(6))) + # @test @zeros(2,3, celltype=Tensor2D) == CellArrays.fill!(MtlCellArray{Tensor2D}(undef,2,3), Tensor2D(zeros((2,2,2,2)))) + # @test @zeros(2,3, celltype=SymmetricTensor2D_Float32) == CellArrays.fill!(MtlCellArray{SymmetricTensor2D_Float32}(undef,2,3), SymmetricTensor2D_Float32(zeros(3))) + # @test @ones(2,3, celltype=SymmetricTensor2D) == CellArrays.fill!(MtlCellArray{SymmetricTensor2D}(undef,2,3), SymmetricTensor2D(ones(3))) + # @test typeof(@rand(2,3, celltype=SymmetricTensor2D)) == typeof(MtlCellArray{SymmetricTensor2D,0}(undef,2,3)) + # @test typeof(@fill(9, 2,3, celltype=SymmetricTensor2D)) == typeof(MtlCellArray{SymmetricTensor2D,0}(undef,2,3)) + # @test @zeros(2,3, celltype=SymmetricTensor2D_Index) == CellArrays.fill!(MtlCellArray{SymmetricTensor2D_Index}(undef,2,3), SymmetricTensor2D_Index(zeros(3))) else @test @zeros(2,3, celltype=SymmetricTensor2D) == CellArrays.fill!(CPUCellArray{SymmetricTensor2D}(undef,2,3), SymmetricTensor2D(zeros(3))) @test @zeros(2,3, celltype=SymmetricTensor3D) == CellArrays.fill!(CPUCellArray{SymmetricTensor3D}(undef,2,3), SymmetricTensor3D(zeros(6))) @@ -265,6 +303,10 @@ const DATA_INDEX = ParallelStencil.INT_THREADS # TODO: using Data.Index does not @test typeof(@rand(2,3, eltype=Float64)) == typeof(AMDGPU.ROCArray(rand(Float64,2,3))) @test typeof(@fill(9, 2,3, eltype=Float64)) == typeof(AMDGPU.ROCArray(fill(convert(Float64, 9), 2,3))) @test typeof(@zeros(2,3, eltype=DATA_INDEX)) == typeof(AMDGPU.ROCArray(zeros(DATA_INDEX,2,3))) + elseif $package == $PKG_METAL + @test typeof(@zeros(2,3, eltype=Float32)) == typeof(Metal.MtlArray(zeros(Float32,2,3))) + @test typeof(@ones(2,3, eltype=Float32)) == typeof(Metal.MtlArray(ones(Float32,2,3))) + @test typeof(@zeros(2,3, eltype=DATA_INDEX)) == typeof(Metal.MtlArray(zeros(DATA_INDEX,2,3))) else @test typeof(@zeros(2,3, eltype=Float32)) == typeof(zeros(Float32,2,3)) @test typeof(@ones(2,3, eltype=Float32)) == typeof(ones(Float32,2,3)) @@ -298,6 +340,11 @@ const DATA_INDEX = ParallelStencil.INT_THREADS # TODO: using Data.Index does not @test @falses(2,3, celldims=(3,4)) == CellArrays.fill!(ROCCellArray{T_Bool}(undef,2,3), falses((3,4))) @test @trues(2,3, celldims=(3,4)) == CellArrays.fill!(ROCCellArray{T_Bool}(undef,2,3), trues((3,4))) AMDGPU.allowscalar(false) + elseif $package == $PKG_METAL + # @test @zeros(2,3, celldims=(3,4), eltype=Float32) == CellArrays.fill!(MetalCellArray{T_Float32}(undef,2,3), T_Float32(zeros((3,4)))) + # @test @ones(2,3, celldims=(3,4), eltype=Float32) == CellArrays.fill!(MetalCellArray{T_Float32}(undef,2,3), T_Float32(ones((3,4)))) + # @test @falses(2,3, celldims=(3,4)) == CellArrays.fill!(MetalCellArray{T_Bool}(undef,2,3), falses((3,4))) + # @test @trues(2,3, celldims=(3,4)) == CellArrays.fill!(MetalCellArray{T_Bool}(undef,2,3), trues((3,4))) else @test @zeros(2,3, celldims=(3,4), eltype=Float32) == CellArrays.fill!(CPUCellArray{T_Float32}(undef,2,3), T_Float32(zeros((3,4)))) @test @ones(2,3, celldims=(3,4), eltype=Float32) == CellArrays.fill!(CPUCellArray{T_Float32}(undef,2,3), T_Float32(ones((3,4)))) @@ -330,6 +377,14 @@ const DATA_INDEX = ParallelStencil.INT_THREADS # TODO: using Data.Index does not @test 
typeof(@rand(2,3, celltype=SymmetricTensor2D)) == typeof(ROCCellArray{SymmetricTensor2D,0}(undef,2,3)) @test typeof(@fill(9, 2,3, celltype=SymmetricTensor2D)) == typeof(ROCCellArray{SymmetricTensor2D,0}(undef,2,3)) AMDGPU.allowscalar(false) + elseif $package == $PKG_METAL + # @test @zeros(2,3, celltype=SymmetricTensor2D) == CellArrays.fill!(MetalCellArray{SymmetricTensor2D}(undef,2,3), SymmetricTensor2D(zeros(3))) + # @test @zeros(2,3, celltype=SymmetricTensor3D) == CellArrays.fill!(MetalCellArray{SymmetricTensor3D}(undef,2,3), SymmetricTensor3D(zeros(6))) + # @test @zeros(2,3, celltype=Tensor2D) == CellArrays.fill!(MetalCellArray{Tensor2D}(undef,2,3), Tensor2D(zeros((2,2,2,2)))) + # @test @zeros(2,3, celltype=SymmetricTensor2D_Float32) == CellArrays.fill!(MetalCellArray{SymmetricTensor2D_Float32}(undef,2,3), SymmetricTensor2D_Float32(zeros(3))) + # @test @ones(2,3, celltype=SymmetricTensor2D) == CellArrays.fill!(MetalCellArray{SymmetricTensor2D}(undef,2,3), SymmetricTensor2D(ones(3))) + # @test typeof(@rand(2,3, celltype=SymmetricTensor2D)) == typeof(MetalCellArray{SymmetricTensor2D,0}(undef,2,3)) + # @test typeof(@fill(9, 2,3, celltype=SymmetricTensor2D)) == typeof(MetalCellArray{SymmetricTensor2D,0}(undef,2,3)) else @test @zeros(2,3, celltype=SymmetricTensor2D) == CellArrays.fill!(CPUCellArray{SymmetricTensor2D}(undef,2,3), SymmetricTensor2D(zeros(3))) @test @zeros(2,3, celltype=SymmetricTensor3D) == CellArrays.fill!(CPUCellArray{SymmetricTensor3D}(undef,2,3), SymmetricTensor3D(zeros(6))) @@ -368,6 +423,13 @@ const DATA_INDEX = ParallelStencil.INT_THREADS # TODO: using Data.Index does not @test typeof( @falses(2,3, celldims=(3,4))) == typeof(ROCCellArray{T_Bool, 0}(undef,2,3)) @test typeof( @trues(2,3, celldims=(3,4))) == typeof(ROCCellArray{T_Bool, 0}(undef,2,3)) AMDGPU.allowscalar(false) + elseif $package == $PKG_METAL + # @test typeof( @zeros(2,3, celldims=(3,4))) == typeof(MetalCellArray{T_Float16,0}(undef,2,3)) + # @test typeof( @ones(2,3, celldims=(3,4))) == typeof(MetalCellArray{T_Float16,0}(undef,2,3)) + # @test typeof( @rand(2,3, celldims=(3,4))) == typeof(MetalCellArray{T_Float16,0}(undef,2,3)) + # @test typeof(@fill(9, 2,3, celldims=(3,4))) == typeof(MetalCellArray{T_Float16,0}(undef,2,3)) + # @test typeof( @falses(2,3, celldims=(3,4))) == typeof(MetalCellArray{T_Bool, 0}(undef,2,3)) + # @test typeof( @trues(2,3, celldims=(3,4))) == typeof(MetalCellArray{T_Bool, 0}(undef,2,3)) else @test typeof( @zeros(2,3, celldims=(3,4))) == typeof(CPUCellArray{T_Float16,1}(undef,2,3)) @test typeof( @ones(2,3, celldims=(3,4))) == typeof(CPUCellArray{T_Float16,1}(undef,2,3)) @@ -408,6 +470,19 @@ const DATA_INDEX = ParallelStencil.INT_THREADS # TODO: using Data.Index does not @test typeof( @falses(2,3, celldims=(3,4), blocklength=3)) == typeof(ROCCellArray{T_Bool, 3}(undef,2,3)) @test typeof( @trues(2,3, celldims=(3,4), blocklength=3)) == typeof(ROCCellArray{T_Bool, 3}(undef,2,3)) AMDGPU.allowscalar(false) + elseif $package == $PKG_METAL + # @test typeof( @zeros(2,3, celldims=(3,4), blocklength=1)) == typeof(MetalCellArray{T_Float16,1}(undef,2,3)) + # @test typeof( @ones(2,3, celldims=(3,4), blocklength=1)) == typeof(MetalCellArray{T_Float16,1}(undef,2,3)) + # @test typeof( @rand(2,3, celldims=(3,4), blocklength=1)) == typeof(MetalCellArray{T_Float16,1}(undef,2,3)) + # @test typeof(@fill(9, 2,3, celldims=(3,4), blocklength=1)) == typeof(MetalCellArray{T_Float16,1}(undef,2,3)) + # @test typeof( @falses(2,3, celldims=(3,4), blocklength=1)) == typeof(MetalCellArray{T_Bool, 1}(undef,2,3)) + # 
@test typeof( @trues(2,3, celldims=(3,4), blocklength=1)) == typeof(MetalCellArray{T_Bool, 1}(undef,2,3)) + # @test typeof( @zeros(2,3, celldims=(3,4), blocklength=3)) == typeof(MetalCellArray{T_Float16,3}(undef,2,3)) + # @test typeof( @ones(2,3, celldims=(3,4), blocklength=3)) == typeof(MetalCellArray{T_Float16,3}(undef,2,3)) + # @test typeof( @rand(2,3, celldims=(3,4), blocklength=3)) == typeof(MetalCellArray{T_Float16,3}(undef,2,3)) + # @test typeof(@fill(9, 2,3, celldims=(3,4), blocklength=3)) == typeof(MetalCellArray{T_Float16,3}(undef,2,3)) + # @test typeof( @falses(2,3, celldims=(3,4), blocklength=3)) == typeof(MetalCellArray{T_Bool, 3}(undef,2,3)) + # @test typeof( @trues(2,3, celldims=(3,4), blocklength=3)) == typeof(MetalCellArray{T_Bool, 3}(undef,2,3)) else @test typeof( @zeros(2,3, celldims=(3,4), blocklength=0)) == typeof(CPUCellArray{T_Float16,0}(undef,2,3)) @test typeof( @ones(2,3, celldims=(3,4), blocklength=0)) == typeof(CPUCellArray{T_Float16,0}(undef,2,3)) @@ -447,6 +522,12 @@ const DATA_INDEX = ParallelStencil.INT_THREADS # TODO: using Data.Index does not @test typeof(@fill(solid, 2,3, celldims=(3,4), eltype=Phase)) == typeof(ROCCellArray{T_Phase,0}(undef,2,3)) @test typeof(@fill(@rand(3,4,eltype=Phase), 2,3, celldims=(3,4), eltype=Phase)) == typeof(ROCCellArray{T_Phase,0}(undef,2,3)) AMDGPU.allowscalar(false) + elseif $package == $PKG_METAL + # @test typeof(@rand(2,3, eltype=Phase)) == typeof(Metal.MtlArray(rand(Phase, 2,3))) + # @test typeof(@rand(2,3, celldims=(3,4), eltype=Phase)) == typeof(MetalCellArray{T_Phase,0}(undef,2,3)) + # @test typeof(@fill(solid, 2,3, eltype=Phase)) == typeof(Metal.MtlArray(rand(Phase, 2,3))) + # @test typeof(@fill(solid, 2,3, celldims=(3,4), eltype=Phase)) == typeof(MetalCellArray{T_Phase,0}(undef,2,3)) + # @test typeof(@fill(@rand(3,4,eltype=Phase), 2,3, celldims=(3,4), eltype=Phase)) == typeof(MetalCellArray{T_Phase,0}(undef,2,3)) else @test typeof(@rand(2,3, eltype=Phase)) == typeof(rand(Phase, 2,3)) @test typeof(@rand(2,3, celldims=(3,4), eltype=Phase)) == typeof(CPUCellArray{T_Phase,1}(undef,2,3)) diff --git a/test/ParallelKernel/test_hide_communication.jl b/test/ParallelKernel/test_hide_communication.jl index 4cbc2e1c..402b61e7 100644 --- a/test/ParallelKernel/test_hide_communication.jl +++ b/test/ParallelKernel/test_hide_communication.jl @@ -1,7 +1,7 @@ using Test import ParallelStencil using ParallelStencil.ParallelKernel -import ParallelStencil.ParallelKernel: @reset_parallel_kernel, @is_initialized, SUPPORTED_PACKAGES, PKG_CUDA, PKG_AMDGPU +import ParallelStencil.ParallelKernel: @reset_parallel_kernel, @is_initialized, SUPPORTED_PACKAGES, PKG_CUDA, PKG_AMDGPU, PKG_METAL, PKG_POLYESTER import ParallelStencil.ParallelKernel: @require, @prettyexpand, @gorgeousexpand, gorgeousstring, @isgpu import ParallelStencil.ParallelKernel: checkargs_hide_communication, hide_communication_gpu using ParallelStencil.ParallelKernel.Exceptions @@ -14,13 +14,20 @@ end import AMDGPU if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end +@static if PKG_METAL in TEST_PACKAGES + import Metal + if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end +end +@static if PKG_POLYESTER in TEST_PACKAGES + import Polyester +end Base.retry_load_extensions() # Potentially needed to load the extensions after the packages have been filtered. @static for package in TEST_PACKAGES eval(:( @testset "$(basename(@__FILE__)) (package: $(nameof($package)))" begin @testset "1. 
hide_communication macro" begin @require !@is_initialized() - @init_parallel_kernel($package, Float64) + @init_parallel_kernel($package, Float32) @require @is_initialized() @testset "@hide_communication boundary_width block (macro expansion)" begin @static if @isgpu($package) @@ -82,7 +89,7 @@ Base.retry_load_extensions() # Potentially needed to load the extensions after t @parallel add_indices!(A); communication!(A); end - @test all(Array(A) .== communication!([ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)])) + @test all(Array(A) .≈ communication!([ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)])) end; @testset "@hide_communication boundary_width block" begin # This test verifies that the results are correct, even for CUDA.jl < v2.0, where it cannot overlap. A = @zeros(6, 7, 8) @@ -95,7 +102,7 @@ Base.retry_load_extensions() # Potentially needed to load the extensions after t communication_y!(A); communication_z!(A); end - @test all(Array(A) .== communication!([ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)])) + @test all(Array(A) .≈ communication!([ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)])) end; @testset "@hide_communication boundary_width block" begin A = @zeros(6, 7, 8) @@ -110,7 +117,7 @@ Base.retry_load_extensions() # Potentially needed to load the extensions after t communication_y!(A); communication_z!(A); end - @test all(Array(A) .== communication!([ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)])) + @test all(Array(A) .≈ communication!([ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)])) end; @testset "@hide_communication boundary_width computation_calls=2 block" begin A = @zeros(6, 7, 8) @@ -119,7 +126,7 @@ Base.retry_load_extensions() # Potentially needed to load the extensions after t @parallel add_indices2!(A); communication!(A); end - @test all(Array(A) .== communication!([2*(ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2)) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)])) + @test all(Array(A) .≈ communication!([2*(ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2)) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)])) end; @testset "@hide_communication boundary_width computation_calls=2 block" begin A = @zeros(6, 7, 8) @@ -128,7 +135,7 @@ Base.retry_load_extensions() # Potentially needed to load the extensions after t @parallel (1:6, 1:7, 1:8) add_indices2!(A); communication!(A); end - @test all(Array(A) .== communication!([2*(ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2)) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)])) + @test all(Array(A) .≈ communication!([2*(ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2)) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)])) end; @testset "@hide_communication boundary_width computation_calls=2 block" begin A = @zeros(6, 7, 8) @@ -137,7 +144,7 @@ Base.retry_load_extensions() # Potentially needed to load the extensions after t @parallel (1:6, 1:7, 1:8) add_indices2!(A); communication!(A); end - @test all(Array(A) .== communication!([2*(ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2)) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)])) + @test all(Array(A) .≈ communication!([2*(ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2)) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)])) 
end; @testset "@hide_communication boundary_width computation_calls=3 block" begin A = @zeros(6, 7, 8) @@ -147,7 +154,7 @@ Base.retry_load_extensions() # Potentially needed to load the extensions after t @parallel add_indices3!(A); communication!(A); end - @test all(Array(A) .== communication!([3*(ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2)) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)])) + @test all(Array(A) .≈ communication!([3*(ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2)) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)])) end; @testset "@hide_communication ranges_outer ranges_inner block" begin A = @zeros(6, 7, 8) @@ -157,14 +164,14 @@ Base.retry_load_extensions() # Potentially needed to load the extensions after t @parallel add_indices!(A); communication!(A); end - @test all(Array(A) .== communication!([ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)])) + @test all(Array(A) .≈ communication!([ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)])) end; end; @reset_parallel_kernel() end; @testset "2. Exceptions" begin @require !@is_initialized() - @init_parallel_kernel($package, Float64) + @init_parallel_kernel($package, Float32) @require @is_initialized @testset "arguments @hide_communication" begin @test_throws ArgumentError checkargs_hide_communication(:boundary_width, :block) # Error: the last argument must be a code block. diff --git a/test/ParallelKernel/test_init_parallel_kernel.jl b/test/ParallelKernel/test_init_parallel_kernel.jl index 39e62f72..fe4ab4b5 100644 --- a/test/ParallelKernel/test_init_parallel_kernel.jl +++ b/test/ParallelKernel/test_init_parallel_kernel.jl @@ -1,7 +1,7 @@ using Test import ParallelStencil using ParallelStencil.ParallelKernel -import ParallelStencil.ParallelKernel: @reset_parallel_kernel, @is_initialized, @get_package, @get_numbertype, @get_inbounds, NUMBERTYPE_NONE, SUPPORTED_PACKAGES, PKG_CUDA, PKG_AMDGPU +import ParallelStencil.ParallelKernel: @reset_parallel_kernel, @is_initialized, @get_package, @get_numbertype, @get_inbounds, NUMBERTYPE_NONE, SUPPORTED_PACKAGES, PKG_CUDA, PKG_AMDGPU, PKG_METAL, PKG_POLYESTER import ParallelStencil.ParallelKernel: @require, @symbols import ParallelStencil.ParallelKernel: extract_posargs_init, extract_kwargs_init, check_already_initialized, set_initialized, is_initialized, check_initialized using ParallelStencil.ParallelKernel.Exceptions @@ -14,6 +14,13 @@ end import AMDGPU if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end +@static if PKG_METAL in TEST_PACKAGES + import Metal + if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end +end +@static if PKG_POLYESTER in TEST_PACKAGES + import Polyester +end Base.retry_load_extensions() # Potentially needed to load the extensions after the packages have been filtered. 
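The changes above that relax .== to .≈ and initialize with Float32 instead of Float64 both stem from Metal's lack of Float64 support: once kernels may run in single precision, decimal literals and accumulated arithmetic are no longer bit-identical to the Float64 CPU references. A one-line illustration in plain Julia (not part of the patch; 100.65 is a literal that appears in the test kernels further below):

x = 100.65          # Float64 literal
Float32(x) == x     # false: 100.65 has no exact Float32 representation
Float32(x) ≈ x      # true: equal within the default relative tolerance of isapprox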
@static for package in TEST_PACKAGES eval(:( diff --git a/test/ParallelKernel/test_kernel_language.jl b/test/ParallelKernel/test_kernel_language.jl index 3b6da0dc..eb56a91b 100644 --- a/test/ParallelKernel/test_kernel_language.jl +++ b/test/ParallelKernel/test_kernel_language.jl @@ -1,7 +1,7 @@ using Test import ParallelStencil using ParallelStencil.ParallelKernel -import ParallelStencil.ParallelKernel: @reset_parallel_kernel, @is_initialized, SUPPORTED_PACKAGES, PKG_CUDA, PKG_AMDGPU, PKG_THREADS, PKG_POLYESTER +import ParallelStencil.ParallelKernel: @reset_parallel_kernel, @is_initialized, SUPPORTED_PACKAGES, PKG_CUDA, PKG_AMDGPU, PKG_METAL, PKG_THREADS, PKG_POLYESTER import ParallelStencil.ParallelKernel: @require, @prettystring, @iscpu import ParallelStencil.ParallelKernel: checknoargs, checkargs_sharedMem, Dim3 using ParallelStencil.ParallelKernel.Exceptions @@ -14,13 +14,25 @@ end import AMDGPU if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end +@static if PKG_METAL in TEST_PACKAGES + import Metal + if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end +end +@static if PKG_POLYESTER in TEST_PACKAGES + import Polyester +end Base.retry_load_extensions() # Potentially needed to load the extensions after the packages have been filtered. @static for package in TEST_PACKAGES eval(:( @testset "$(basename(@__FILE__)) (package: $(nameof($package)))" begin @testset "1. kernel language macros" begin @require !@is_initialized() - @init_parallel_kernel($package, Float64) + # @static if $package == $PKG_METAL + # @init_parallel_kernel($package, Float32) + # else + # @init_parallel_kernel($package, Float64) + # end + @init_parallel_kernel($package, Float32) @require @is_initialized() @testset "mapping to package" begin if $package == $PKG_CUDA @@ -41,6 +53,15 @@ Base.retry_load_extensions() # Potentially needed to load the extensions after t # @test @prettystring(1, @sharedMem(Float32, (2,3))) == "" #TODO: not yet supported for AMDGPU # @test @prettystring(1, @pk_show()) == "CUDA.@cushow" #TODO: not yet supported for AMDGPU # @test @prettystring(1, @pk_println()) == "AMDGPU.@rocprintln" + elseif $package == $PKG_METAL + @test @prettystring(1, @gridDim()) == "Metal.threadgroups_per_grid_3d()" + @test @prettystring(1, @blockIdx()) == "Metal.threadgroup_position_in_grid_3d()" + @test @prettystring(1, @blockDim()) == "Metal.threads_per_threadgroup_3d()" + @test @prettystring(1, @threadIdx()) == "Metal.thread_position_in_threadgroup_3d()" + @test @prettystring(1, @sync_threads()) == "Metal.threadgroup_barrier(; flag = Metal.MemoryFlagThreadGroup)" + @test @prettystring(1, @sharedMem(Float32, (2,3))) == "ParallelStencil.ParallelKernel.@sharedMem_metal Float32 (2, 3)" + # @test @prettystring(1, @pk_show()) == "Metal.@mtlshow" + # @test @prettystring(1, @pk_println()) == "Metal.@mtlprintln" elseif @iscpu($package) @test @prettystring(1, @gridDim()) == "ParallelStencil.ParallelKernel.@gridDim_cpu" @test @prettystring(1, @blockIdx()) == "ParallelStencil.ParallelKernel.@blockIdx_cpu" @@ -193,7 +214,12 @@ Base.retry_load_extensions() # Potentially needed to load the extensions after t @reset_parallel_kernel() end; @testset "2. 
Exceptions" begin - @init_parallel_kernel($package, Float64) + # @static if $package == $PKG_METAL + # @init_parallel_kernel($package, Float32) + # else + # @init_parallel_kernel($package, Float64) + # end + @init_parallel_kernel($package, Float32) @require @is_initialized @testset "no arguments" begin @test_throws ArgumentError checknoargs(:(something)); # Error: length(args) != 0 diff --git a/test/ParallelKernel/test_parallel.jl b/test/ParallelKernel/test_parallel.jl index 021e69fc..5965c791 100644 --- a/test/ParallelKernel/test_parallel.jl +++ b/test/ParallelKernel/test_parallel.jl @@ -3,7 +3,7 @@ import ParallelStencil using Enzyme using ParallelStencil.ParallelKernel import ParallelStencil.ParallelKernel.AD -import ParallelStencil.ParallelKernel: @reset_parallel_kernel, @is_initialized, SUPPORTED_PACKAGES, PKG_CUDA, PKG_AMDGPU, PKG_THREADS, PKG_POLYESTER, INDICES +import ParallelStencil.ParallelKernel: @reset_parallel_kernel, @is_initialized, SUPPORTED_PACKAGES, PKG_CUDA, PKG_AMDGPU, PKG_METAL, PKG_THREADS, PKG_POLYESTER, INDICES import ParallelStencil.ParallelKernel: @require, @prettystring, @gorgeousstring, @isgpu, @iscpu import ParallelStencil.ParallelKernel: checkargs_parallel, checkargs_parallel_indices, parallel_indices, maxsize using ParallelStencil.ParallelKernel.Exceptions @@ -16,6 +16,10 @@ end import AMDGPU if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end +@static if PKG_METAL in TEST_PACKAGES + import Metal + if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end +end @static if PKG_POLYESTER in TEST_PACKAGES import Polyester end @@ -28,7 +32,12 @@ import Enzyme @testset "$(basename(@__FILE__)) (package: $(nameof($package)))" begin @testset "1. parallel macros" begin @require !@is_initialized() - @init_parallel_kernel($package, Float64) + # @static if $package == $PKG_METAL + # @init_parallel_kernel($package, Float32) + # else + # @init_parallel_kernel($package, Float64) + # end + @init_parallel_kernel($package, Float32) @require @is_initialized() @testset "@parallel" begin @static if $package == $PKG_CUDA @@ -55,6 +64,8 @@ import Enzyme @test occursin("AMDGPU.@roc gridsize = nblocks groupsize = nthreads stream = AMDGPU.stream() f(A, ParallelStencil.ParallelKernel.promote_ranges(ranges), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[3])))", call) call = @prettystring(1, @parallel nblocks nthreads stream=mystream f(A)) @test occursin("AMDGPU.@roc gridsize = nblocks groupsize = nthreads stream = mystream f(A, ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.compute_ranges(nblocks .* nthreads)), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.compute_ranges(nblocks .* nthreads)))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.compute_ranges(nblocks .* nthreads)))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.compute_ranges(nblocks .* nthreads)))[3])))", call) + elseif $package == $PKG_METAL + ## TODO elseif @iscpu($package) @test @prettystring(1, @parallel f(A)) == "f(A, ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)), 
(Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[3])))" @test @prettystring(1, @parallel ranges f(A)) == "f(A, ParallelStencil.ParallelKernel.promote_ranges(ranges), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[3])))" @@ -70,7 +81,7 @@ import Enzyme @testset "maxsize" begin struct BitstypeStruct x::Int - y::Float64 + y::Float32 end @test maxsize([9 9; 9 9; 9 9]) == (3, 2, 1) @test maxsize(8) == (1, 1, 1) @@ -101,8 +112,8 @@ import Enzyme B̄ = @ones(N) A_ref = Array(A) B_ref = Array(B) - Ā_ref = ones(N) - B̄_ref = ones(N) + Ā_ref = ones(Float32, N) + B̄_ref = ones(Float32, N) @parallel_indices (ix) function f!(A, B, a) A[ix] += a * B[ix] * 100.65 return @@ -289,7 +300,7 @@ import Enzyme return end @parallel write_indices!(A); - @test all(Array(A) .== [ix for ix=1:size(A,1)]) + @test all(Array(A) .≈ [ix for ix=1:size(A,1)]) end; @testset "@parallel_indices (2D)" begin A = @zeros(4, 5) @@ -298,7 +309,7 @@ import Enzyme return end @parallel write_indices!(A); - @test all(Array(A) .== [ix + (iy-1)*size(A,1) for ix=1:size(A,1), iy=1:size(A,2)]) + @test all(Array(A) .≈ [ix + (iy-1)*size(A,1) for ix=1:size(A,1), iy=1:size(A,2)]) end; @testset "@parallel_indices (3D)" begin A = @zeros(4, 5, 6) @@ -307,7 +318,7 @@ import Enzyme return end @parallel write_indices!(A); - @test all(Array(A) .== [ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]) + @test all(Array(A) .≈ [ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]) end; @testset "@parallel_indices (1D in 3D)" begin A = @zeros(4, 5, 6) @@ -316,7 +327,7 @@ import Enzyme return end @parallel 1:size(A,2) write_indices!(A); - @test all(Array(A)[1,:,1] .== [iy for iy=1:size(A,2)]) + @test all(Array(A)[1,:,1] .≈ [iy for iy=1:size(A,2)]) end; @testset "@parallel_indices (2D in 3D)" begin A = @zeros(4, 5, 6) @@ -325,7 +336,7 @@ import Enzyme return end @parallel (1:size(A,1), 1:size(A,3)) write_indices!(A); - @test all(Array(A)[:,end,:] .== [ix + (iz-1)*size(A,1) for ix=1:size(A,1), iz=1:size(A,3)]) + @test all(Array(A)[:,end,:] .≈ [ix + (iz-1)*size(A,1) for ix=1:size(A,1), iz=1:size(A,3)]) end; @testset "@parallel_indices (2D in 3D with macro)" begin A = @zeros(4, 5, 6) @@ -334,7 +345,7 @@ import Enzyme return end @parallel (1:size(A,1), 1:size(A,3)) write_indices!(A); - @test all(Array(A)[:,end,:] .== [ix + (iz-1)*size(A,1) for ix=1:size(A,1), iz=1:size(A,3)]) + @test all(Array(A)[:,end,:] .≈ [ix + (iz-1)*size(A,1) for ix=1:size(A,1), iz=1:size(A,3)]) end; @testset "@parallel_indices (2D in 3D with macro with aliases)" begin A = @zeros(4, 5, 6) @@ -343,7 +354,7 @@ import Enzyme return end @parallel (1:size(A,1), 1:size(A,3)) write_indices!(A); - @test all(Array(A)[:,end,:] .== [ix + (iz-1)*size(A,1) for ix=1:size(A,1), iz=1:size(A,3)]) + @test all(Array(A)[:,end,:] .≈ [ix + (iz-1)*size(A,1) for ix=1:size(A,1), iz=1:size(A,3)]) end; @static if $package != $PKG_POLYESTER @testset "nested function (long definition, array modification)" begin @@ -357,7 +368,7 @@ import Enzyme return end 
@parallel write_indices!(A); - @test all(Array(A) .== [ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]) + @test all(Array(A) .≈ [ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]) end; @testset "nested function (short definition, array modification)" begin A = @zeros(4, 5, 6) @@ -367,7 +378,7 @@ import Enzyme return end @parallel write_indices!(A); - @test all(Array(A) .== [ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]) + @test all(Array(A) .≈ [ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]) end; @testset "nested function (long definition, return value)" begin A = @zeros(4, 5, 6) @@ -379,7 +390,7 @@ import Enzyme return end @parallel write_indices!(A); - @test all(Array(A) .== [ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]) + @test all(Array(A) .≈ [ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]) end; @testset "nested function (short definition, return value)" begin A = @zeros(4, 5, 6) @@ -389,7 +400,7 @@ import Enzyme return end @parallel write_indices!(A); - @test all(Array(A) .== [ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]) + @test all(Array(A) .≈ [ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]) end; end end; @@ -411,14 +422,16 @@ import Enzyme @reset_parallel_kernel() end; @testset "2. parallel macros (literal conversion)" begin - @testset "@parallel_indices (Float64)" begin - @require !@is_initialized() - @init_parallel_kernel($package, Float64) - @require @is_initialized() - expansion = @gorgeousstring(@parallel_indices (ix) f!(A) = (A[ix] = A[ix] + 1.0f0; return)) - @test occursin("A[ix] = A[ix] + 1.0\n", expansion) - @reset_parallel_kernel() - end; + # @testset "@parallel_indices (Float64)" begin + # @require !@is_initialized() + # @static if $package == $PKG_METAL + # return + # end + # @require @is_initialized() + # expansion = @gorgeousstring(@parallel_indices (ix) f!(A) = (A[ix] = A[ix] + 1.0f0; return)) + # @test occursin("A[ix] = A[ix] + 1.0\n", expansion) + # @reset_parallel_kernel() + # end; @testset "@parallel_indices (Float32)" begin @require !@is_initialized() @init_parallel_kernel($package, Float32) @@ -463,7 +476,12 @@ import Enzyme @testset "3. global defaults" begin @testset "inbounds=true" begin @require !@is_initialized() - @init_parallel_kernel($package, Float64, inbounds=true) + # @static if $package == $PKG_METAL + # @init_parallel_kernel($package, Float32, inbounds=true) + # else + # @init_parallel_kernel($package, Float64, inbounds=true) + # end + @init_parallel_kernel($package, Float32, inbounds=true) @require @is_initialized expansion = @prettystring(1, @parallel_indices (ix) inbounds=true f(A) = (2*A; return)) @test occursin("Base.@inbounds begin", expansion) @@ -500,7 +518,12 @@ import Enzyme end; @testset "5. 
Exceptions" begin @require !@is_initialized() - @init_parallel_kernel($package, Float64) + # @static if $package == $PKG_METAL + # @init_parallel_kernel($package, Float32) + # else + # @init_parallel_kernel($package, Float64) + # end + @init_parallel_kernel($package, Float32) @require @is_initialized @testset "arguments @parallel" begin @test_throws ArgumentError checkargs_parallel(); # Error: isempty(args) diff --git a/test/ParallelKernel/test_reset_parallel_kernel.jl b/test/ParallelKernel/test_reset_parallel_kernel.jl index 4bbde1da..593a5e21 100644 --- a/test/ParallelKernel/test_reset_parallel_kernel.jl +++ b/test/ParallelKernel/test_reset_parallel_kernel.jl @@ -1,7 +1,7 @@ using Test import ParallelStencil using ParallelStencil.ParallelKernel -import ParallelStencil.ParallelKernel: @reset_parallel_kernel, @is_initialized, @get_package, @get_numbertype, SUPPORTED_PACKAGES, PKG_CUDA, PKG_AMDGPU, PKG_NONE, NUMBERTYPE_NONE +import ParallelStencil.ParallelKernel: @reset_parallel_kernel, @is_initialized, @get_package, @get_numbertype, SUPPORTED_PACKAGES, PKG_CUDA, PKG_AMDGPU, PKG_METAL, PKG_POLYESTER, PKG_NONE, NUMBERTYPE_NONE import ParallelStencil.ParallelKernel: @require, @symbols TEST_PACKAGES = SUPPORTED_PACKAGES @static if PKG_CUDA in TEST_PACKAGES @@ -12,6 +12,13 @@ end import AMDGPU if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end +@static if PKG_METAL in TEST_PACKAGES + import Metal + if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end +end +@static if PKG_POLYESTER in TEST_PACKAGES + import Polyester +end Base.retry_load_extensions() # Potentially needed to load the extensions after the packages have been filtered. @static for package in TEST_PACKAGES eval(:( diff --git a/test/test_FiniteDifferences1D.jl b/test/test_FiniteDifferences1D.jl index 59578674..63934e13 100644 --- a/test/test_FiniteDifferences1D.jl +++ b/test/test_FiniteDifferences1D.jl @@ -1,6 +1,6 @@ using Test using ParallelStencil -import ParallelStencil: @reset_parallel_stencil, @is_initialized, SUPPORTED_PACKAGES, PKG_CUDA, PKG_AMDGPU +import ParallelStencil: @reset_parallel_stencil, @is_initialized, SUPPORTED_PACKAGES, PKG_CUDA, PKG_AMDGPU, PKG_METAL, PKG_POLYESTER import ParallelStencil: @require using ParallelStencil.FiniteDifferences1D TEST_PACKAGES = SUPPORTED_PACKAGES @@ -16,12 +16,20 @@ end import Metal if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end end +@static if PKG_POLYESTER in TEST_PACKAGES + import Polyester +end Base.retry_load_extensions() # Potentially needed to load the extensions after the packages have been filtered. 
@static for package in TEST_PACKAGES eval(:( @testset "$(basename(@__FILE__)) (package: $(nameof($package)))" begin @require !@is_initialized() - @init_parallel_stencil($package, Float64, 1) + # @static if $package == $PKG_METAL + # @init_parallel_stencil($package, Float32, 1) + # else + # @init_parallel_stencil($package, Float64, 1) + # end + @init_parallel_stencil($package, Float32, 1) @require @is_initialized() nx = 7 A = @rand(nx ); @@ -33,44 +41,44 @@ Base.retry_load_extensions() # Potentially needed to load the extensions after t @testset "differences" begin @parallel d!(R, Ax) = (@all(R) = @d(Ax); return) @parallel d2!(R, Axx) = (@all(R) = @d2(Axx); return) - R.=0; @parallel d!(R, Ax); @test all(Array(R .== Ax[2:end].-Ax[1:end-1])) # INFO: AMDGPU arrays need to be compared on CPU - R.=0; @parallel d2!(R, Axx); @test all(Array(R .== (Axx[3:end].-Axx[2:end-1]).-(Axx[2:end-1].-Axx[1:end-2]))) + R.=0; @parallel d!(R, Ax); @test all(Array(R .≈ Ax[2:end].-Ax[1:end-1])) # INFO: AMDGPU arrays need to be compared on CPU + R.=0; @parallel d2!(R, Axx); @test all(Array(R .≈ (Axx[3:end].-Axx[2:end-1]).-(Axx[2:end-1].-Axx[1:end-2]))) end; @testset "selection" begin @parallel all!(R, A) = (@all(R) = @all(A); return) @parallel inn!(R, Axx) = (@all(R) = @inn(Axx); return) - R.=0; @parallel all!(R, A); @test all(Array(R .== A)) - R.=0; @parallel inn!(R, Axx); @test all(Array(R .== Axx[2:end-1])) + R.=0; @parallel all!(R, A); @test all(Array(R .≈ A)) + R.=0; @parallel inn!(R, Axx); @test all(Array(R .≈ Axx[2:end-1])) end; @testset "averages" begin @parallel av!(R, Ax) = (@all(R) = @av(Ax); return) - R.=0; @parallel av!(R, Ax); @test all(Array(R .== (Ax[1:end-1].+Ax[2:end]).*0.5)) + R.=0; @parallel av!(R, Ax); @test all(Array(R .≈ (Ax[1:end-1].+Ax[2:end])./2)) end; @testset "harmonic averages" begin @parallel harm!(R, Ax) = (@all(R) = @harm(Ax); return) - R.=0; @parallel harm!(R, Ax); @test all(Array(R .== 2 ./(1 ./Ax[1:end-1].+1 ./Ax[2:end]))) + R.=0; @parallel harm!(R, Ax); @test all(Array(R .≈ 2 ./(1 ./Ax[1:end-1].+1 ./Ax[2:end]))) end; @testset "others" begin @parallel maxloc!(R, Axx) = (@all(R) = @maxloc(Axx); return) - R.=0; @parallel maxloc!(R, Axx); @test all(Array(R .== max.(max.(Axx[3:end],Axx[2:end-1]),Axx[1:end-2]))) + R.=0; @parallel maxloc!(R, Axx); @test all(Array(R .≈ max.(max.(Axx[3:end],Axx[2:end-1]),Axx[1:end-2]))) end; end; @testset "2. apply masks" begin @testset "selection" begin @parallel inn_all!(Rxx, A) = (@inn(Rxx) = @all(A); return) @parallel inn_inn!(Rxx, Axx) = (@inn(Rxx) = @inn(Axx); return) - Rxx.=0; @parallel inn_all!(Rxx, A); @test all(Array(Rxx[2:end-1] .== A)) - Rxx[2:end-1].=0; @test all(Array(Rxx .== 0)) # Test that boundary values remained zero. - Rxx.=0; @parallel inn_inn!(Rxx, Axx); @test all(Array(Rxx[2:end-1] .== Axx[2:end-1])) - Rxx[2:end-1].=0; @test all(Array(Rxx .== 0)) # Test that boundary values remained zero. + Rxx.=0; @parallel inn_all!(Rxx, A); @test all(Array(Rxx[2:end-1] .≈ A)) + Rxx[2:end-1].=0; @test all(Array(Rxx .≈ 0)) # Test that boundary values remained zero. + Rxx.=0; @parallel inn_inn!(Rxx, Axx); @test all(Array(Rxx[2:end-1] .≈ Axx[2:end-1])) + Rxx[2:end-1].=0; @test all(Array(Rxx .≈ 0)) # Test that boundary values remained zero. 
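# Why the comparisons above use .≈ rather than .==: with Float32 as the number type, results
# computed in a GPU kernel are not guaranteed to be bit-identical to the CPU reference expression
# (literals get truncated to Float32, and evaluation order or FMA contraction may differ), so the
# tests rely on isapprox with its default relative tolerance. Minimal sketch (plain CPU Julia,
# illustrative values only):
x = 1.0f0 + eps(1.0f0)   # differs from 1.0f0 only in the last bit
x == 1.0f0               # false: exact equality is sensitive to rounding
x ≈ 1.0f0                # true: isapprox defaults to rtol = sqrt(eps(Float32)) ≈ 3.45f-4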
end; @testset "differences" begin @parallel inn_d!(Rxx, Ax) = (@inn(Rxx) = @d(Ax); return) @parallel inn_d2!(Rxx, Axx) = (@inn(Rxx) = @d2(Axx); return) - Rxx.=0; @parallel inn_d!(Rxx, Ax); @test all(Array(Rxx[2:end-1] .== Ax[2:end].-Ax[1:end-1])) - Rxx[2:end-1].=0; @test all(Array(Rxx .== 0)) # Test that boundary values remained zero. - Rxx.=0; @parallel inn_d2!(Rxx, Axx); @test all(Array(Rxx[2:end-1] .== (Axx[3:end].-Axx[2:end-1]).-(Axx[2:end-1].-Axx[1:end-2]))) - Rxx[2:end-1].=0; @test all(Array(Rxx .== 0)) # Test that boundary values remained zero. + Rxx.=0; @parallel inn_d!(Rxx, Ax); @test all(Array(Rxx[2:end-1] .≈ Ax[2:end].-Ax[1:end-1])) + Rxx[2:end-1].=0; @test all(Array(Rxx .≈ 0)) # Test that boundary values remained zero. + Rxx.=0; @parallel inn_d2!(Rxx, Axx); @test all(Array(Rxx[2:end-1] .≈ (Axx[3:end].-Axx[2:end-1]).-(Axx[2:end-1].-Axx[1:end-2]))) + Rxx[2:end-1].=0; @test all(Array(Rxx .≈ 0)) # Test that boundary values remained zero. end; end; @reset_parallel_stencil() diff --git a/test/test_FiniteDifferences2D.jl b/test/test_FiniteDifferences2D.jl index 539cb365..4b094e2e 100644 --- a/test/test_FiniteDifferences2D.jl +++ b/test/test_FiniteDifferences2D.jl @@ -1,6 +1,6 @@ using Test using ParallelStencil -import ParallelStencil: @reset_parallel_stencil, @is_initialized, SUPPORTED_PACKAGES, PKG_CUDA, PKG_AMDGPU +import ParallelStencil: @reset_parallel_stencil, @is_initialized, SUPPORTED_PACKAGES, PKG_CUDA, PKG_AMDGPU, PKG_METAL, PKG_POLYESTER import ParallelStencil: @require using ParallelStencil.FiniteDifferences2D TEST_PACKAGES = SUPPORTED_PACKAGES @@ -16,12 +16,15 @@ end import Metal if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end end +@static if PKG_POLYESTER in TEST_PACKAGES + import Polyester +end Base.retry_load_extensions() # Potentially needed to load the extensions after the packages have been filtered. 
@static for package in TEST_PACKAGES eval(:( @testset "$(basename(@__FILE__)) (package: $(nameof($package)))" begin @require !@is_initialized() - @init_parallel_stencil($package, Float64, 2) + @init_parallel_stencil($package, Float32, 2) @require @is_initialized() nx, ny = 7, 5 A = @rand(nx, ny ); @@ -45,24 +48,24 @@ Base.retry_load_extensions() # Potentially needed to load the extensions after t @parallel d2_ya!(R, Ayy) = (@all(R) = @d2_ya(Ayy); return) @parallel d2_xi!(R, Axxyy) = (@all(R) = @d2_xi(Axxyy); return) @parallel d2_yi!(R, Axxyy) = (@all(R) = @d2_yi(Axxyy); return) - R.=0; @parallel d_xa!(R, Ax); @test all(Array(R .== Ax[2:end, :].-Ax[1:end-1, :])) - R.=0; @parallel d_ya!(R, Ay); @test all(Array(R .== Ay[ :,2:end].-Ay[ :,1:end-1])) - R.=0; @parallel d_xi!(R, Axyy); @test all(Array(R .== Axyy[2:end ,2:end-1].-Axyy[1:end-1,2:end-1])) - R.=0; @parallel d_yi!(R, Axxy); @test all(Array(R .== Axxy[2:end-1,2:end ].-Axxy[2:end-1,1:end-1])) - R.=0; @parallel d2_xa!(R, Axx); @test all(Array(R .== (Axx[3:end, :].-Axx[2:end-1, :]).-(Axx[2:end-1, :].-Axx[1:end-2, :]))) - R.=0; @parallel d2_ya!(R, Ayy); @test all(Array(R .== (Ayy[ :,3:end].-Ayy[ :,2:end-1]).-(Ayy[ :,2:end-1].-Ayy[ :,1:end-2]))) - R.=0; @parallel d2_xi!(R, Axxyy); @test all(Array(R .== (Axxyy[3:end,2:end-1].-Axxyy[2:end-1,2:end-1]).-(Axxyy[2:end-1,2:end-1].-Axxyy[1:end-2,2:end-1]))) - R.=0; @parallel d2_yi!(R, Axxyy); @test all(Array(R .== (Axxyy[2:end-1,3:end].-Axxyy[2:end-1,2:end-1]).-(Axxyy[2:end-1,2:end-1].-Axxyy[2:end-1,1:end-2]))) + R.=0; @parallel d_xa!(R, Ax); @test all(Array(R .≈ Ax[2:end, :].-Ax[1:end-1, :])) + R.=0; @parallel d_ya!(R, Ay); @test all(Array(R .≈ Ay[ :,2:end].-Ay[ :,1:end-1])) + R.=0; @parallel d_xi!(R, Axyy); @test all(Array(R .≈ Axyy[2:end ,2:end-1].-Axyy[1:end-1,2:end-1])) + R.=0; @parallel d_yi!(R, Axxy); @test all(Array(R .≈ Axxy[2:end-1,2:end ].-Axxy[2:end-1,1:end-1])) + R.=0; @parallel d2_xa!(R, Axx); @test all(Array(R .≈ (Axx[3:end, :].-Axx[2:end-1, :]).-(Axx[2:end-1, :].-Axx[1:end-2, :]))) + R.=0; @parallel d2_ya!(R, Ayy); @test all(Array(R .≈ (Ayy[ :,3:end].-Ayy[ :,2:end-1]).-(Ayy[ :,2:end-1].-Ayy[ :,1:end-2]))) + R.=0; @parallel d2_xi!(R, Axxyy); @test all(Array(R .≈ (Axxyy[3:end,2:end-1].-Axxyy[2:end-1,2:end-1]).-(Axxyy[2:end-1,2:end-1].-Axxyy[1:end-2,2:end-1]))) + R.=0; @parallel d2_yi!(R, Axxyy); @test all(Array(R .≈ (Axxyy[2:end-1,3:end].-Axxyy[2:end-1,2:end-1]).-(Axxyy[2:end-1,2:end-1].-Axxyy[2:end-1,1:end-2]))) end; @testset "selection" begin @parallel all!(R, A) = (@all(R) = @all(A); return) @parallel inn!(R, Axxyy) = (@all(R) = @inn(Axxyy); return) @parallel inn_x!(R, Axx) = (@all(R) = @inn_x(Axx); return) @parallel inn_y!(R, Ayy) = (@all(R) = @inn_y(Ayy); return) - R.=0; @parallel all!(R, A); @test all(Array(R .== A)) - R.=0; @parallel inn!(R, Axxyy); @test all(Array(R .== Axxyy[2:end-1,2:end-1])) - R.=0; @parallel inn_x!(R, Axx); @test all(Array(R .== Axx[2:end-1, :])) - R.=0; @parallel inn_y!(R, Ayy); @test all(Array(R .== Ayy[ :,2:end-1])) + R.=0; @parallel all!(R, A); @test all(Array(R .≈ A)) + R.=0; @parallel inn!(R, Axxyy); @test all(Array(R .≈ Axxyy[2:end-1,2:end-1])) + R.=0; @parallel inn_x!(R, Axx); @test all(Array(R .≈ Axx[2:end-1, :])) + R.=0; @parallel inn_y!(R, Ayy); @test all(Array(R .≈ Ayy[ :,2:end-1])) end; @testset "averages" begin @parallel av!(R, Axy) = (@all(R) = @av(Axy); return) @@ -70,11 +73,11 @@ Base.retry_load_extensions() # Potentially needed to load the extensions after t @parallel av_ya!(R, Ay) = (@all(R) = @av_ya(Ay); return) @parallel av_xi!(R, Axyy) 
= (@all(R) = @av_xi(Axyy); return) @parallel av_yi!(R, Axxy) = (@all(R) = @av_yi(Axxy); return) - R.=0; @parallel av!(R, Axy); @test all(Array(R .== (Axy[1:end-1,1:end-1].+Axy[2:end,1:end-1].+Axy[1:end-1,2:end].+Axy[2:end,2:end])*0.25)) - R.=0; @parallel av_xa!(R, Ax); @test all(Array(R .== (Ax[2:end, :].+Ax[1:end-1, :]).*0.5)) - R.=0; @parallel av_ya!(R, Ay); @test all(Array(R .== (Ay[ :,2:end].+Ay[ :,1:end-1]).*0.5)) - R.=0; @parallel av_xi!(R, Axyy); @test all(Array(R .== (Axyy[2:end ,2:end-1].+Axyy[1:end-1,2:end-1]).*0.5)) - R.=0; @parallel av_yi!(R, Axxy); @test all(Array(R .== (Axxy[2:end-1,2:end ].+Axxy[2:end-1,1:end-1]).*0.5)) + R.=0; @parallel av!(R, Axy); @test all(Array(R .≈ (Axy[1:end-1,1:end-1].+Axy[2:end,1:end-1].+Axy[1:end-1,2:end].+Axy[2:end,2:end])./4)) + R.=0; @parallel av_xa!(R, Ax); @test all(Array(R .≈ (Ax[2:end, :].+Ax[1:end-1, :])./2)) + R.=0; @parallel av_ya!(R, Ay); @test all(Array(R .≈ (Ay[ :,2:end].+Ay[ :,1:end-1])./2)) + R.=0; @parallel av_xi!(R, Axyy); @test all(Array(R .≈ (Axyy[2:end ,2:end-1].+Axyy[1:end-1,2:end-1])./2)) + R.=0; @parallel av_yi!(R, Axxy); @test all(Array(R .≈ (Axxy[2:end-1,2:end ].+Axxy[2:end-1,1:end-1])./2)) end; @testset "harmonic averages" begin @parallel harm!(R, Axy) = (@all(R) = @harm(Axy); return) @@ -82,36 +85,36 @@ Base.retry_load_extensions() # Potentially needed to load the extensions after t @parallel harm_ya!(R, Ay) = (@all(R) = @harm_ya(Ay); return) @parallel harm_xi!(R, Axyy) = (@all(R) = @harm_xi(Axyy); return) @parallel harm_yi!(R, Axxy) = (@all(R) = @harm_yi(Axxy); return) - R.=0; @parallel harm!(R, Axy); @test all(Array(R .== 4 ./(1 ./Axy[1:end-1,1:end-1].+1 ./Axy[2:end,1:end-1].+1 ./Axy[1:end-1,2:end].+1 ./Axy[2:end,2:end]))) - R.=0; @parallel harm_xa!(R, Ax); @test all(Array(R .== 2 ./(1 ./Ax[2:end, :].+1 ./Ax[1:end-1, :]))) - R.=0; @parallel harm_ya!(R, Ay); @test all(Array(R .== 2 ./(1 ./Ay[ :,2:end].+1 ./Ay[ :,1:end-1]))) - R.=0; @parallel harm_xi!(R, Axyy); @test all(Array(R .== 2 ./(1 ./Axyy[2:end ,2:end-1].+1 ./Axyy[1:end-1,2:end-1]))) - R.=0; @parallel harm_yi!(R, Axxy); @test all(Array(R .== 2 ./(1 ./Axxy[2:end-1,2:end ].+1 ./Axxy[2:end-1,1:end-1]))) + R.=0; @parallel harm!(R, Axy); @test all(Array(R .≈ 4 ./(1 ./Axy[1:end-1,1:end-1].+1 ./Axy[2:end,1:end-1].+1 ./Axy[1:end-1,2:end].+1 ./Axy[2:end,2:end]))) + R.=0; @parallel harm_xa!(R, Ax); @test all(Array(R .≈ 2 ./(1 ./Ax[2:end, :].+1 ./Ax[1:end-1, :]))) + R.=0; @parallel harm_ya!(R, Ay); @test all(Array(R .≈ 2 ./(1 ./Ay[ :,2:end].+1 ./Ay[ :,1:end-1]))) + R.=0; @parallel harm_xi!(R, Axyy); @test all(Array(R .≈ 2 ./(1 ./Axyy[2:end ,2:end-1].+1 ./Axyy[1:end-1,2:end-1]))) + R.=0; @parallel harm_yi!(R, Axxy); @test all(Array(R .≈ 2 ./(1 ./Axxy[2:end-1,2:end ].+1 ./Axxy[2:end-1,1:end-1]))) end; @testset "others" begin @parallel maxloc!(R, Axxyy) = (@all(R) = @maxloc(Axxyy); return) - R.=0; @parallel maxloc!(R, Axxyy); @test all(Array(R .== max.(max.(max.(max.(Axxyy[1:end-2,2:end-1],Axxyy[3:end,2:end-1]),Axxyy[2:end-1,2:end-1]),Axxyy[2:end-1,1:end-2]),Axxyy[2:end-1,3:end]))) + R.=0; @parallel maxloc!(R, Axxyy); @test all(Array(R .≈ max.(max.(max.(max.(Axxyy[1:end-2,2:end-1],Axxyy[3:end,2:end-1]),Axxyy[2:end-1,2:end-1]),Axxyy[2:end-1,1:end-2]),Axxyy[2:end-1,3:end]))) end; end; @testset "2. 
apply masks" begin @testset "selection" begin @parallel inn_all!(Rxxyy, A) = (@inn(Rxxyy) = @all(A); return) @parallel inn_inn!(Rxxyy, Axxyy) = (@inn(Rxxyy) = @inn(Axxyy); return) - Rxxyy.=0; @parallel inn_all!(Rxxyy, A); @test all(Array(Rxxyy[2:end-1,2:end-1] .== A)) - Rxxyy[2:end-1,2:end-1].=0; @test all(Array(Rxxyy .== 0)) # Test that boundary values remained zero. - Rxxyy.=0; @parallel inn_inn!(Rxxyy, Axxyy); @test all(Array(Rxxyy[2:end-1,2:end-1] .== Axxyy[2:end-1,2:end-1])) - Rxxyy[2:end-1,2:end-1].=0; @test all(Array(Rxxyy .== 0)) # Test that boundary values remained zero. + Rxxyy.=0; @parallel inn_all!(Rxxyy, A); @test all(Array(Rxxyy[2:end-1,2:end-1] .≈ A)) + Rxxyy[2:end-1,2:end-1].=0; @test all(Array(Rxxyy .≈ 0)) # Test that boundary values remained zero. + Rxxyy.=0; @parallel inn_inn!(Rxxyy, Axxyy); @test all(Array(Rxxyy[2:end-1,2:end-1] .≈ Axxyy[2:end-1,2:end-1])) + Rxxyy[2:end-1,2:end-1].=0; @test all(Array(Rxxyy .≈ 0)) # Test that boundary values remained zero. end; @testset "differences" begin @parallel inn_d_xa!(Rxxyy, Ax) = (@inn(Rxxyy) = @d_xa(Ax); return) @parallel inn_d_yi!(Rxxyy, Axxy) = (@inn(Rxxyy) = @d_yi(Axxy); return) @parallel inn_d2_yi!(Rxxyy, Axxyy) = (@inn(Rxxyy) = @d2_yi(Axxyy); return) - Rxxyy.=0; @parallel inn_d_xa!(Rxxyy, Ax); @test all(Array(Rxxyy[2:end-1,2:end-1] .== Ax[2:end, :].-Ax[1:end-1, :])) - Rxxyy[2:end-1,2:end-1].=0; @test all(Array(Rxxyy .== 0)) # Test that boundary values remained zero. - Rxxyy.=0; @parallel inn_d_yi!(Rxxyy, Axxy); @test all(Array(Rxxyy[2:end-1,2:end-1] .== Axxy[2:end-1,2:end ].-Axxy[2:end-1,1:end-1])) - Rxxyy[2:end-1,2:end-1].=0; @test all(Array(Rxxyy .== 0)) # Test that boundary values remained zero. - Rxxyy.=0; @parallel inn_d2_yi!(Rxxyy, Axxyy); @test all(Array(Rxxyy[2:end-1,2:end-1] .== (Axxyy[2:end-1,3:end].-Axxyy[2:end-1,2:end-1]).-(Axxyy[2:end-1,2:end-1].-Axxyy[2:end-1,1:end-2]))) - Rxxyy[2:end-1,2:end-1].=0; @test all(Array(Rxxyy .== 0)) # Test that boundary values remained zero. + Rxxyy.=0; @parallel inn_d_xa!(Rxxyy, Ax); @test all(Array(Rxxyy[2:end-1,2:end-1] .≈ Ax[2:end, :].-Ax[1:end-1, :])) + Rxxyy[2:end-1,2:end-1].=0; @test all(Array(Rxxyy .≈ 0)) # Test that boundary values remained zero. + Rxxyy.=0; @parallel inn_d_yi!(Rxxyy, Axxy); @test all(Array(Rxxyy[2:end-1,2:end-1] .≈ Axxy[2:end-1,2:end ].-Axxy[2:end-1,1:end-1])) + Rxxyy[2:end-1,2:end-1].=0; @test all(Array(Rxxyy .≈ 0)) # Test that boundary values remained zero. + Rxxyy.=0; @parallel inn_d2_yi!(Rxxyy, Axxyy); @test all(Array(Rxxyy[2:end-1,2:end-1] .≈ (Axxyy[2:end-1,3:end].-Axxyy[2:end-1,2:end-1]).-(Axxyy[2:end-1,2:end-1].-Axxyy[2:end-1,1:end-2]))) + Rxxyy[2:end-1,2:end-1].=0; @test all(Array(Rxxyy .≈ 0)) # Test that boundary values remained zero. 
end; end; @reset_parallel_stencil() diff --git a/test/test_FiniteDifferences3D.jl b/test/test_FiniteDifferences3D.jl index 2c0154b6..1ccdb7bb 100644 --- a/test/test_FiniteDifferences3D.jl +++ b/test/test_FiniteDifferences3D.jl @@ -1,6 +1,6 @@ using Test using ParallelStencil -import ParallelStencil: @reset_parallel_stencil, @is_initialized, SUPPORTED_PACKAGES, PKG_CUDA, PKG_AMDGPU +import ParallelStencil: @reset_parallel_stencil, @is_initialized, SUPPORTED_PACKAGES, PKG_CUDA, PKG_AMDGPU, PKG_METAL, PKG_POLYESTER import ParallelStencil: @require using ParallelStencil.FiniteDifferences3D TEST_PACKAGES = SUPPORTED_PACKAGES @@ -16,12 +16,20 @@ end import Metal if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end end +@static if PKG_POLYESTER in TEST_PACKAGES + import Polyester +end Base.retry_load_extensions() # Potentially needed to load the extensions after the packages have been filtered. @static for package in TEST_PACKAGES eval(:( @testset "$(basename(@__FILE__)) (package: $(nameof($package)))" begin @require !@is_initialized() - @init_parallel_stencil($package, Float64, 3) + # @static if $package == $PKG_METAL + # @init_parallel_stencil($package, Float32, 3) + # else + # @init_parallel_stencil($package, Float64, 3) + # end + @init_parallel_stencil($package, Float32, 3) @require @is_initialized() nx, ny, nz = 7, 5, 6 A = @rand(nx , ny , nz ); @@ -58,15 +66,15 @@ Base.retry_load_extensions() # Potentially needed to load the extensions after t @parallel d2_xi!(R, Axxyyzz) = (@all(R) = @d2_xi(Axxyyzz); return) @parallel d2_yi!(R, Axxyyzz) = (@all(R) = @d2_yi(Axxyyzz); return) @parallel d2_zi!(R, Axxyyzz) = (@all(R) = @d2_zi(Axxyyzz); return) - R.=0; @parallel d_xa!(R, Ax); @test all(Array(R .== Ax[2:end, :, :].-Ax[1:end-1, :, :])) - R.=0; @parallel d_ya!(R, Ay); @test all(Array(R .== Ay[ :,2:end, :].-Ay[ :,1:end-1, :])) - R.=0; @parallel d_za!(R, Az); @test all(Array(R .== Az[ :, :,2:end].-Az[ :, :,1:end-1])) - R.=0; @parallel d_xi!(R, Axyyzz); @test all(Array(R .== Axyyzz[2:end ,2:end-1,2:end-1].-Axyyzz[1:end-1,2:end-1,2:end-1])) - R.=0; @parallel d_yi!(R, Axxyzz); @test all(Array(R .== Axxyzz[2:end-1,2:end ,2:end-1].-Axxyzz[2:end-1,1:end-1,2:end-1])) - R.=0; @parallel d_zi!(R, Axxyyz); @test all(Array(R .== Axxyyz[2:end-1,2:end-1,2:end ].-Axxyyz[2:end-1,2:end-1,1:end-1])) - R.=0; @parallel d2_xi!(R, Axxyyzz); @test all(Array(R .== (Axxyyzz[3:end,2:end-1,2:end-1].-Axxyyzz[2:end-1,2:end-1,2:end-1]).-(Axxyyzz[2:end-1,2:end-1,2:end-1].-Axxyyzz[1:end-2,2:end-1,2:end-1]))) - R.=0; @parallel d2_yi!(R, Axxyyzz); @test all(Array(R .== (Axxyyzz[2:end-1,3:end,2:end-1].-Axxyyzz[2:end-1,2:end-1,2:end-1]).-(Axxyyzz[2:end-1,2:end-1,2:end-1].-Axxyyzz[2:end-1,1:end-2,2:end-1]))) - R.=0; @parallel d2_zi!(R, Axxyyzz); @test all(Array(R .== (Axxyyzz[2:end-1,2:end-1,3:end].-Axxyyzz[2:end-1,2:end-1,2:end-1]).-(Axxyyzz[2:end-1,2:end-1,2:end-1].-Axxyyzz[2:end-1,2:end-1,1:end-2]))) + R.=0; @parallel d_xa!(R, Ax); @test all(Array(R .≈ Ax[2:end, :, :].-Ax[1:end-1, :, :])) + R.=0; @parallel d_ya!(R, Ay); @test all(Array(R .≈ Ay[ :,2:end, :].-Ay[ :,1:end-1, :])) + R.=0; @parallel d_za!(R, Az); @test all(Array(R .≈ Az[ :, :,2:end].-Az[ :, :,1:end-1])) + R.=0; @parallel d_xi!(R, Axyyzz); @test all(Array(R .≈ Axyyzz[2:end ,2:end-1,2:end-1].-Axyyzz[1:end-1,2:end-1,2:end-1])) + R.=0; @parallel d_yi!(R, Axxyzz); @test all(Array(R .≈ Axxyzz[2:end-1,2:end ,2:end-1].-Axxyzz[2:end-1,1:end-1,2:end-1])) + R.=0; @parallel d_zi!(R, Axxyyz); @test all(Array(R .≈ Axxyyz[2:end-1,2:end-1,2:end 
].-Axxyyz[2:end-1,2:end-1,1:end-1])) + R.=0; @parallel d2_xi!(R, Axxyyzz); @test all(Array(R .≈ (Axxyyzz[3:end,2:end-1,2:end-1].-Axxyyzz[2:end-1,2:end-1,2:end-1]).-(Axxyyzz[2:end-1,2:end-1,2:end-1].-Axxyyzz[1:end-2,2:end-1,2:end-1]))) + R.=0; @parallel d2_yi!(R, Axxyyzz); @test all(Array(R .≈ (Axxyyzz[2:end-1,3:end,2:end-1].-Axxyyzz[2:end-1,2:end-1,2:end-1]).-(Axxyyzz[2:end-1,2:end-1,2:end-1].-Axxyyzz[2:end-1,1:end-2,2:end-1]))) + R.=0; @parallel d2_zi!(R, Axxyyzz); @test all(Array(R .≈ (Axxyyzz[2:end-1,2:end-1,3:end].-Axxyyzz[2:end-1,2:end-1,2:end-1]).-(Axxyyzz[2:end-1,2:end-1,2:end-1].-Axxyyzz[2:end-1,2:end-1,1:end-2]))) end; @testset "selection" begin @parallel all!(R, A) = (@all(R) = @all(A); return) @@ -77,14 +85,14 @@ Base.retry_load_extensions() # Potentially needed to load the extensions after t @parallel inn_xy!(R, Axxyy) = (@all(R) = @inn_xy(Axxyy); return) @parallel inn_xz!(R, Axxzz) = (@all(R) = @inn_xz(Axxzz); return) @parallel inn_yz!(R, Ayyzz) = (@all(R) = @inn_yz(Ayyzz); return) - R.=0; @parallel all!(R, A); @test all(Array(R .== A)) - R.=0; @parallel inn!(R, Axxyyzz); @test all(Array(R .== Axxyyzz[2:end-1,2:end-1,2:end-1])) - R.=0; @parallel inn_x!(R, Axx); @test all(Array(R .== Axx[2:end-1, :, :])) - R.=0; @parallel inn_y!(R, Ayy); @test all(Array(R .== Ayy[ :,2:end-1, :])) - R.=0; @parallel inn_z!(R, Azz); @test all(Array(R .== Azz[ :, :,2:end-1])) - R.=0; @parallel inn_xy!(R, Axxyy); @test all(Array(R .== Axxyy[2:end-1,2:end-1, :])) - R.=0; @parallel inn_xz!(R, Axxzz); @test all(Array(R .== Axxzz[2:end-1, :,2:end-1])) - R.=0; @parallel inn_yz!(R, Ayyzz); @test all(Array(R .== Ayyzz[ :,2:end-1,2:end-1])) + R.=0; @parallel all!(R, A); @test all(Array(R .≈ A)) + R.=0; @parallel inn!(R, Axxyyzz); @test all(Array(R .≈ Axxyyzz[2:end-1,2:end-1,2:end-1])) + R.=0; @parallel inn_x!(R, Axx); @test all(Array(R .≈ Axx[2:end-1, :, :])) + R.=0; @parallel inn_y!(R, Ayy); @test all(Array(R .≈ Ayy[ :,2:end-1, :])) + R.=0; @parallel inn_z!(R, Azz); @test all(Array(R .≈ Azz[ :, :,2:end-1])) + R.=0; @parallel inn_xy!(R, Axxyy); @test all(Array(R .≈ Axxyy[2:end-1,2:end-1, :])) + R.=0; @parallel inn_xz!(R, Axxzz); @test all(Array(R .≈ Axxzz[2:end-1, :,2:end-1])) + R.=0; @parallel inn_yz!(R, Ayyzz); @test all(Array(R .≈ Ayyzz[ :,2:end-1,2:end-1])) end; @testset "averages" begin @parallel av!(R, Axyz) = (@all(R) = @av(Axyz); return) @@ -100,19 +108,19 @@ Base.retry_load_extensions() # Potentially needed to load the extensions after t @parallel av_xyi!(R, Axyzz) = (@all(R) = @av_xyi(Axyzz); return) @parallel av_xzi!(R, Axyyz) = (@all(R) = @av_xzi(Axyyz); return) @parallel av_yzi!(R, Axxyz) = (@all(R) = @av_yzi(Axxyz); return) - R.=0; @parallel av!(R, Axyz); @test all(Array(R .== (Axyz[1:end-1,1:end-1,1:end-1].+Axyz[2:end,1:end-1,1:end-1].+Axyz[2:end,2:end,1:end-1].+Axyz[2:end,2:end,2:end].+Axyz[1:end-1,2:end,2:end].+Axyz[1:end-1,1:end-1,2:end].+Axyz[2:end,1:end-1,2:end].+Axyz[1:end-1,2:end,1:end-1])*0.125)) - R.=0; @parallel av_xa!(R, Ax); @test all(Array(R .== (Ax[2:end, :, :].+Ax[1:end-1, :, :]).*0.5)) - R.=0; @parallel av_ya!(R, Ay); @test all(Array(R .== (Ay[ :,2:end, :].+Ay[ :,1:end-1, :]).*0.5)) - R.=0; @parallel av_za!(R, Az); @test all(Array(R .== (Az[ :, :,2:end].+Az[ :, :,1:end-1]).*0.5)) - R.=0; @parallel av_xi!(R, Axyyzz); @test all(Array(R .== (Axyyzz[2:end ,2:end-1,2:end-1].+Axyyzz[1:end-1,2:end-1,2:end-1]).*0.5)) - R.=0; @parallel av_yi!(R, Axxyzz); @test all(Array(R .== (Axxyzz[2:end-1,2:end ,2:end-1].+Axxyzz[2:end-1,1:end-1,2:end-1]).*0.5)) - R.=0; @parallel av_zi!(R, Axxyyz); 
@test all(Array(R .== (Axxyyz[2:end-1,2:end-1,2:end ].+Axxyyz[2:end-1,2:end-1,1:end-1]).*0.5)) - R.=0; @parallel av_xya!(R, Axy); @test all(Array(R .== (Axy[1:end-1,1:end-1,:].+Axy[2:end,1:end-1,:].+Axy[1:end-1,2:end,:].+Axy[2:end,2:end,:])*0.25)) - R.=0; @parallel av_xza!(R, Axz); @test all(Array(R .== (Axz[1:end-1,:,1:end-1].+Axz[2:end,:,1:end-1].+Axz[1:end-1,:,2:end].+Axz[2:end,:,2:end])*0.25)) - R.=0; @parallel av_yza!(R, Ayz); @test all(Array(R .== (Ayz[:,1:end-1,1:end-1].+Ayz[:,2:end,1:end-1].+Ayz[:,1:end-1,2:end].+Ayz[:,2:end,2:end])*0.25)) - R.=0; @parallel av_xyi!(R, Axyzz); @test all(Array(R .== (Axyzz[1:end-1,1:end-1,2:end-1].+Axyzz[2:end,1:end-1,2:end-1].+Axyzz[1:end-1,2:end,2:end-1].+Axyzz[2:end,2:end,2:end-1])*0.25)) - R.=0; @parallel av_xzi!(R, Axyyz); @test all(Array(R .== (Axyyz[1:end-1,2:end-1,1:end-1].+Axyyz[2:end,2:end-1,1:end-1].+Axyyz[1:end-1,2:end-1,2:end].+Axyyz[2:end,2:end-1,2:end])*0.25)) - R.=0; @parallel av_yzi!(R, Axxyz); @test all(Array(R .== (Axxyz[2:end-1,1:end-1,1:end-1].+Axxyz[2:end-1,2:end,1:end-1].+Axxyz[2:end-1,1:end-1,2:end].+Axxyz[2:end-1,2:end,2:end])*0.25)) + R.=0; @parallel av!(R, Axyz); @test all(Array(R .≈ (Axyz[1:end-1,1:end-1,1:end-1].+Axyz[2:end,1:end-1,1:end-1].+Axyz[2:end,2:end,1:end-1].+Axyz[2:end,2:end,2:end].+Axyz[1:end-1,2:end,2:end].+Axyz[1:end-1,1:end-1,2:end].+Axyz[2:end,1:end-1,2:end].+Axyz[1:end-1,2:end,1:end-1])./8)) + R.=0; @parallel av_xa!(R, Ax); @test all(Array(R .≈ (Ax[2:end, :, :].+Ax[1:end-1, :, :])./2)) + R.=0; @parallel av_ya!(R, Ay); @test all(Array(R .≈ (Ay[ :,2:end, :].+Ay[ :,1:end-1, :])./2)) + R.=0; @parallel av_za!(R, Az); @test all(Array(R .≈ (Az[ :, :,2:end].+Az[ :, :,1:end-1])./2)) + R.=0; @parallel av_xi!(R, Axyyzz); @test all(Array(R .≈ (Axyyzz[2:end ,2:end-1,2:end-1].+Axyyzz[1:end-1,2:end-1,2:end-1])./2)) + R.=0; @parallel av_yi!(R, Axxyzz); @test all(Array(R .≈ (Axxyzz[2:end-1,2:end ,2:end-1].+Axxyzz[2:end-1,1:end-1,2:end-1])./2)) + R.=0; @parallel av_zi!(R, Axxyyz); @test all(Array(R .≈ (Axxyyz[2:end-1,2:end-1,2:end ].+Axxyyz[2:end-1,2:end-1,1:end-1])./2)) + R.=0; @parallel av_xya!(R, Axy); @test all(Array(R .≈ (Axy[1:end-1,1:end-1,:].+Axy[2:end,1:end-1,:].+Axy[1:end-1,2:end,:].+Axy[2:end,2:end,:])./4)) + R.=0; @parallel av_xza!(R, Axz); @test all(Array(R .≈ (Axz[1:end-1,:,1:end-1].+Axz[2:end,:,1:end-1].+Axz[1:end-1,:,2:end].+Axz[2:end,:,2:end])./4)) + R.=0; @parallel av_yza!(R, Ayz); @test all(Array(R .≈ (Ayz[:,1:end-1,1:end-1].+Ayz[:,2:end,1:end-1].+Ayz[:,1:end-1,2:end].+Ayz[:,2:end,2:end])./4)) + R.=0; @parallel av_xyi!(R, Axyzz); @test all(Array(R .≈ (Axyzz[1:end-1,1:end-1,2:end-1].+Axyzz[2:end,1:end-1,2:end-1].+Axyzz[1:end-1,2:end,2:end-1].+Axyzz[2:end,2:end,2:end-1])./4)) + R.=0; @parallel av_xzi!(R, Axyyz); @test all(Array(R .≈ (Axyyz[1:end-1,2:end-1,1:end-1].+Axyyz[2:end,2:end-1,1:end-1].+Axyyz[1:end-1,2:end-1,2:end].+Axyyz[2:end,2:end-1,2:end])./4)) + R.=0; @parallel av_yzi!(R, Axxyz); @test all(Array(R .≈ (Axxyz[2:end-1,1:end-1,1:end-1].+Axxyz[2:end-1,2:end,1:end-1].+Axxyz[2:end-1,1:end-1,2:end].+Axxyz[2:end-1,2:end,2:end])./4)) end; @testset "harmonic averages" begin @parallel harm!(R, Axyz) = (@all(R) = @harm(Axyz); return) @@ -128,44 +136,44 @@ Base.retry_load_extensions() # Potentially needed to load the extensions after t @parallel harm_xyi!(R, Axyzz) = (@all(R) = @harm_xyi(Axyzz); return) @parallel harm_xzi!(R, Axyyz) = (@all(R) = @harm_xzi(Axyyz); return) @parallel harm_yzi!(R, Axxyz) = (@all(R) = @harm_yzi(Axxyz); return) - R.=0; @parallel harm!(R, Axyz); @test all(Array(R .== 8 ./(1 
./Axyz[1:end-1,1:end-1,1:end-1].+1 ./Axyz[2:end,1:end-1,1:end-1].+1 ./Axyz[2:end,2:end,1:end-1].+1 ./Axyz[2:end,2:end,2:end].+1 ./Axyz[1:end-1,2:end,2:end].+1 ./Axyz[1:end-1,1:end-1,2:end].+1 ./Axyz[2:end,1:end-1,2:end].+1 ./Axyz[1:end-1,2:end,1:end-1]) )) - R.=0; @parallel harm_xa!(R, Ax); @test all(Array(R .== 2 ./(1 ./Ax[2:end, :, :].+1 ./Ax[1:end-1, :, :]))) - R.=0; @parallel harm_ya!(R, Ay); @test all(Array(R .== 2 ./(1 ./Ay[ :,2:end, :].+1 ./Ay[ :,1:end-1, :]))) - R.=0; @parallel harm_za!(R, Az); @test all(Array(R .== 2 ./(1 ./Az[ :, :,2:end].+1 ./Az[ :, :,1:end-1]))) - R.=0; @parallel harm_xi!(R, Axyyzz); @test all(Array(R .== 2 ./(1 ./Axyyzz[2:end ,2:end-1,2:end-1].+1 ./Axyyzz[1:end-1,2:end-1,2:end-1]))) - R.=0; @parallel harm_yi!(R, Axxyzz); @test all(Array(R .== 2 ./(1 ./Axxyzz[2:end-1,2:end ,2:end-1].+1 ./Axxyzz[2:end-1,1:end-1,2:end-1]))) - R.=0; @parallel harm_zi!(R, Axxyyz); @test all(Array(R .== 2 ./(1 ./Axxyyz[2:end-1,2:end-1,2:end ].+1 ./Axxyyz[2:end-1,2:end-1,1:end-1]))) - R.=0; @parallel harm_xya!(R, Axy); @test all(Array(R .== 4 ./(1 ./Axy[1:end-1,1:end-1,:].+1 ./Axy[2:end,1:end-1,:].+1 ./Axy[1:end-1,2:end,:].+1 ./Axy[2:end,2:end,:]))) - R.=0; @parallel harm_xza!(R, Axz); @test all(Array(R .== 4 ./(1 ./Axz[1:end-1,:,1:end-1].+1 ./Axz[2:end,:,1:end-1].+1 ./Axz[1:end-1,:,2:end].+1 ./Axz[2:end,:,2:end]))) - R.=0; @parallel harm_yza!(R, Ayz); @test all(Array(R .== 4 ./(1 ./Ayz[:,1:end-1,1:end-1].+1 ./Ayz[:,2:end,1:end-1].+1 ./Ayz[:,1:end-1,2:end].+1 ./Ayz[:,2:end,2:end]))) - R.=0; @parallel harm_xyi!(R, Axyzz); @test all(Array(R .== 4 ./(1 ./Axyzz[1:end-1,1:end-1,2:end-1].+1 ./Axyzz[2:end,1:end-1,2:end-1].+1 ./Axyzz[1:end-1,2:end,2:end-1].+1 ./Axyzz[2:end,2:end,2:end-1]))) - R.=0; @parallel harm_xzi!(R, Axyyz); @test all(Array(R .== 4 ./(1 ./Axyyz[1:end-1,2:end-1,1:end-1].+1 ./Axyyz[2:end,2:end-1,1:end-1].+1 ./Axyyz[1:end-1,2:end-1,2:end].+1 ./Axyyz[2:end,2:end-1,2:end]))) - R.=0; @parallel harm_yzi!(R, Axxyz); @test all(Array(R .== 4 ./(1 ./Axxyz[2:end-1,1:end-1,1:end-1].+1 ./Axxyz[2:end-1,2:end,1:end-1].+1 ./Axxyz[2:end-1,1:end-1,2:end].+1 ./Axxyz[2:end-1,2:end,2:end]))) + R.=0; @parallel harm!(R, Axyz); @test all(Array(R .≈ 8 ./(1 ./Axyz[1:end-1,1:end-1,1:end-1].+1 ./Axyz[2:end,1:end-1,1:end-1].+1 ./Axyz[2:end,2:end,1:end-1].+1 ./Axyz[2:end,2:end,2:end].+1 ./Axyz[1:end-1,2:end,2:end].+1 ./Axyz[1:end-1,1:end-1,2:end].+1 ./Axyz[2:end,1:end-1,2:end].+1 ./Axyz[1:end-1,2:end,1:end-1]) )) + R.=0; @parallel harm_xa!(R, Ax); @test all(Array(R .≈ 2 ./(1 ./Ax[2:end, :, :].+1 ./Ax[1:end-1, :, :]))) + R.=0; @parallel harm_ya!(R, Ay); @test all(Array(R .≈ 2 ./(1 ./Ay[ :,2:end, :].+1 ./Ay[ :,1:end-1, :]))) + R.=0; @parallel harm_za!(R, Az); @test all(Array(R .≈ 2 ./(1 ./Az[ :, :,2:end].+1 ./Az[ :, :,1:end-1]))) + R.=0; @parallel harm_xi!(R, Axyyzz); @test all(Array(R .≈ 2 ./(1 ./Axyyzz[2:end ,2:end-1,2:end-1].+1 ./Axyyzz[1:end-1,2:end-1,2:end-1]))) + R.=0; @parallel harm_yi!(R, Axxyzz); @test all(Array(R .≈ 2 ./(1 ./Axxyzz[2:end-1,2:end ,2:end-1].+1 ./Axxyzz[2:end-1,1:end-1,2:end-1]))) + R.=0; @parallel harm_zi!(R, Axxyyz); @test all(Array(R .≈ 2 ./(1 ./Axxyyz[2:end-1,2:end-1,2:end ].+1 ./Axxyyz[2:end-1,2:end-1,1:end-1]))) + R.=0; @parallel harm_xya!(R, Axy); @test all(Array(R .≈ 4 ./(1 ./Axy[1:end-1,1:end-1,:].+1 ./Axy[2:end,1:end-1,:].+1 ./Axy[1:end-1,2:end,:].+1 ./Axy[2:end,2:end,:]))) + R.=0; @parallel harm_xza!(R, Axz); @test all(Array(R .≈ 4 ./(1 ./Axz[1:end-1,:,1:end-1].+1 ./Axz[2:end,:,1:end-1].+1 ./Axz[1:end-1,:,2:end].+1 ./Axz[2:end,:,2:end]))) + R.=0; @parallel harm_yza!(R, 
Ayz); @test all(Array(R .≈ 4 ./(1 ./Ayz[:,1:end-1,1:end-1].+1 ./Ayz[:,2:end,1:end-1].+1 ./Ayz[:,1:end-1,2:end].+1 ./Ayz[:,2:end,2:end]))) + R.=0; @parallel harm_xyi!(R, Axyzz); @test all(Array(R .≈ 4 ./(1 ./Axyzz[1:end-1,1:end-1,2:end-1].+1 ./Axyzz[2:end,1:end-1,2:end-1].+1 ./Axyzz[1:end-1,2:end,2:end-1].+1 ./Axyzz[2:end,2:end,2:end-1]))) + R.=0; @parallel harm_xzi!(R, Axyyz); @test all(Array(R .≈ 4 ./(1 ./Axyyz[1:end-1,2:end-1,1:end-1].+1 ./Axyyz[2:end,2:end-1,1:end-1].+1 ./Axyyz[1:end-1,2:end-1,2:end].+1 ./Axyyz[2:end,2:end-1,2:end]))) + R.=0; @parallel harm_yzi!(R, Axxyz); @test all(Array(R .≈ 4 ./(1 ./Axxyz[2:end-1,1:end-1,1:end-1].+1 ./Axxyz[2:end-1,2:end,1:end-1].+1 ./Axxyz[2:end-1,1:end-1,2:end].+1 ./Axxyz[2:end-1,2:end,2:end]))) end; @testset "others" begin @parallel maxloc!(R, Axxyyzz) = (@all(R) = @maxloc(Axxyyzz); return) - R.=0; @parallel maxloc!(R, Axxyyzz); @test all(Array(R .== max.(max.(max.(max.(max.(max.(Axxyyzz[1:end-2,2:end-1,2:end-1],Axxyyzz[3:end,2:end-1,2:end-1]),Axxyyzz[2:end-1,2:end-1,2:end-1]),Axxyyzz[2:end-1,1:end-2,2:end-1]),Axxyyzz[2:end-1,3:end,2:end-1]),Axxyyzz[2:end-1,2:end-1,1:end-2]),Axxyyzz[2:end-1,2:end-1,3:end]))) + R.=0; @parallel maxloc!(R, Axxyyzz); @test all(Array(R .≈ max.(max.(max.(max.(max.(max.(Axxyyzz[1:end-2,2:end-1,2:end-1],Axxyyzz[3:end,2:end-1,2:end-1]),Axxyyzz[2:end-1,2:end-1,2:end-1]),Axxyyzz[2:end-1,1:end-2,2:end-1]),Axxyyzz[2:end-1,3:end,2:end-1]),Axxyyzz[2:end-1,2:end-1,1:end-2]),Axxyyzz[2:end-1,2:end-1,3:end]))) end; end; @testset "2. apply masks" begin @testset "selection" begin @parallel inn_all!(Rxxyyzz, A) = (@inn(Rxxyyzz) = @all(A); return) @parallel inn_inn!(Rxxyyzz, Axxyyzz) = (@inn(Rxxyyzz) = @inn(Axxyyzz); return) - Rxxyyzz.=0; @parallel inn_all!(Rxxyyzz, A); @test all(Array(Rxxyyzz[2:end-1,2:end-1,2:end-1] .== A)) - Rxxyyzz[2:end-1,2:end-1,2:end-1].=0; @test all(Array(Rxxyyzz .== 0)) # Test that boundary values remained zero. - Rxxyyzz.=0; @parallel inn_inn!(Rxxyyzz, Axxyyzz); @test all(Array(Rxxyyzz[2:end-1,2:end-1,2:end-1] .== Axxyyzz[2:end-1,2:end-1,2:end-1])) - Rxxyyzz[2:end-1,2:end-1,2:end-1].=0; @test all(Array(Rxxyyzz .== 0)) # Test that boundary values remained zero. + Rxxyyzz.=0; @parallel inn_all!(Rxxyyzz, A); @test all(Array(Rxxyyzz[2:end-1,2:end-1,2:end-1] .≈ A)) + Rxxyyzz[2:end-1,2:end-1,2:end-1].=0; @test all(Array(Rxxyyzz .≈ 0)) # Test that boundary values remained zero. + Rxxyyzz.=0; @parallel inn_inn!(Rxxyyzz, Axxyyzz); @test all(Array(Rxxyyzz[2:end-1,2:end-1,2:end-1] .≈ Axxyyzz[2:end-1,2:end-1,2:end-1])) + Rxxyyzz[2:end-1,2:end-1,2:end-1].=0; @test all(Array(Rxxyyzz .≈ 0)) # Test that boundary values remained zero. end; @testset "differences" begin @parallel inn_d_xa!(Rxxyyzz, Ax) = (@inn(Rxxyyzz) = @d_xa(Ax); return) @parallel inn_d_yi!(Rxxyyzz, Axxyzz) = (@inn(Rxxyyzz) = @d_yi(Axxyzz); return) @parallel inn_d2_yi!(Rxxyyzz, Axxyyzz) = (@inn(Rxxyyzz) = @d2_yi(Axxyyzz); return) - Rxxyyzz.=0; @parallel inn_d_xa!(Rxxyyzz, Ax); @test all(Array(Rxxyyzz[2:end-1,2:end-1,2:end-1] .== Ax[2:end, :, :].-Ax[1:end-1, :, :])) - Rxxyyzz[2:end-1,2:end-1,2:end-1].=0; @test all(Array(Rxxyyzz .== 0)) # Test that boundary values remained zero. - Rxxyyzz.=0; @parallel inn_d_yi!(Rxxyyzz, Axxyzz); @test all(Array(Rxxyyzz[2:end-1,2:end-1,2:end-1] .== Axxyzz[2:end-1,2:end ,2:end-1].-Axxyzz[2:end-1,1:end-1,2:end-1])) - Rxxyyzz[2:end-1,2:end-1,2:end-1].=0; @test all(Array(Rxxyyzz .== 0)) # Test that boundary values remained zero. 
- Rxxyyzz.=0; @parallel inn_d2_yi!(Rxxyyzz, Axxyyzz); @test all(Array(Rxxyyzz[2:end-1,2:end-1,2:end-1] .== (Axxyyzz[2:end-1,3:end,2:end-1].-Axxyyzz[2:end-1,2:end-1,2:end-1]).-(Axxyyzz[2:end-1,2:end-1,2:end-1].-Axxyyzz[2:end-1,1:end-2,2:end-1]))) - Rxxyyzz[2:end-1,2:end-1,2:end-1].=0; @test all(Array(Rxxyyzz .== 0)) # Test that boundary values remained zero. + Rxxyyzz.=0; @parallel inn_d_xa!(Rxxyyzz, Ax); @test all(Array(Rxxyyzz[2:end-1,2:end-1,2:end-1] .≈ Ax[2:end, :, :].-Ax[1:end-1, :, :])) + Rxxyyzz[2:end-1,2:end-1,2:end-1].=0; @test all(Array(Rxxyyzz .≈ 0)) # Test that boundary values remained zero. + Rxxyyzz.=0; @parallel inn_d_yi!(Rxxyyzz, Axxyzz); @test all(Array(Rxxyyzz[2:end-1,2:end-1,2:end-1] .≈ Axxyzz[2:end-1,2:end ,2:end-1].-Axxyzz[2:end-1,1:end-1,2:end-1])) + Rxxyyzz[2:end-1,2:end-1,2:end-1].=0; @test all(Array(Rxxyyzz .≈ 0)) # Test that boundary values remained zero. + Rxxyyzz.=0; @parallel inn_d2_yi!(Rxxyyzz, Axxyyzz); @test all(Array(Rxxyyzz[2:end-1,2:end-1,2:end-1] .≈ (Axxyyzz[2:end-1,3:end,2:end-1].-Axxyyzz[2:end-1,2:end-1,2:end-1]).-(Axxyyzz[2:end-1,2:end-1,2:end-1].-Axxyyzz[2:end-1,1:end-2,2:end-1]))) + Rxxyyzz[2:end-1,2:end-1,2:end-1].=0; @test all(Array(Rxxyyzz .≈ 0)) # Test that boundary values remained zero. end; end; @reset_parallel_stencil() diff --git a/test/test_extensions.jl b/test/test_extensions.jl index cd929f4c..b9a47ec9 100644 --- a/test/test_extensions.jl +++ b/test/test_extensions.jl @@ -14,6 +14,9 @@ end import Metal if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end end +@static if PKG_POLYESTER in TEST_PACKAGES + import Polyester +end exename = joinpath(Sys.BINDIR, Base.julia_exename()) const TEST_PROJECTS = ["Diffusion3D_minimal"] # ["Diffusion3D_minimal", "Diffusion3D", "Diffusion"] diff --git a/test/test_incremental_compilation.jl b/test/test_incremental_compilation.jl index 7a02acea..0a82ddf0 100644 --- a/test/test_incremental_compilation.jl +++ b/test/test_incremental_compilation.jl @@ -1,5 +1,5 @@ using Test -import ParallelStencil: SUPPORTED_PACKAGES, PKG_CUDA, PKG_AMDGPU, PKG_POLYESTER +import ParallelStencil: SUPPORTED_PACKAGES, PKG_CUDA, PKG_AMDGPU, PKG_METAL, PKG_POLYESTER TEST_PACKAGES = SUPPORTED_PACKAGES @static if PKG_CUDA in TEST_PACKAGES import CUDA diff --git a/test/test_init_parallel_stencil.jl b/test/test_init_parallel_stencil.jl index 2370fd65..6f8e168d 100644 --- a/test/test_init_parallel_stencil.jl +++ b/test/test_init_parallel_stencil.jl @@ -1,6 +1,6 @@ using Test using ParallelStencil -import ParallelStencil: @reset_parallel_stencil, @is_initialized, @get_package, @get_numbertype, @get_ndims, @get_inbounds, @get_memopt, SUPPORTED_PACKAGES, PKG_CUDA, PKG_AMDGPU, PKG_NONE, NUMBERTYPE_NONE, NDIMS_NONE +import ParallelStencil: @reset_parallel_stencil, @is_initialized, @get_package, @get_numbertype, @get_ndims, @get_inbounds, @get_memopt, SUPPORTED_PACKAGES, PKG_CUDA, PKG_AMDGPU, PKG_METAL, PKG_POLYESTER, PKG_NONE, NUMBERTYPE_NONE, NDIMS_NONE import ParallelStencil: @require, @symbols import ParallelStencil: extract_posargs_init, extract_kwargs_init, check_already_initialized, set_initialized, is_initialized, check_initialized, set_package, set_numbertype, set_ndims, set_inbounds, set_memopt using ParallelStencil.Exceptions @@ -17,6 +17,9 @@ end import Metal if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end end +@static if PKG_POLYESTER in TEST_PACKAGES + import Polyester +end Base.retry_load_extensions() # Potentially needed to load the extensions after the packages 
have been filtered. @static for package in TEST_PACKAGES eval(:( diff --git a/test/test_parallel.jl b/test/test_parallel.jl index 59ae434d..63e0372f 100644 --- a/test/test_parallel.jl +++ b/test/test_parallel.jl @@ -1,6 +1,6 @@ using Test using ParallelStencil -import ParallelStencil: @reset_parallel_stencil, @is_initialized, SUPPORTED_PACKAGES, PKG_CUDA, PKG_AMDGPU, PKG_THREADS, PKG_POLYESTER, INDICES +import ParallelStencil: @reset_parallel_stencil, @is_initialized, SUPPORTED_PACKAGES, PKG_CUDA, PKG_AMDGPU, PKG_METAL, PKG_THREADS, PKG_POLYESTER, INDICES import ParallelStencil: @require, @prettystring, @gorgeousstring, @isgpu, @iscpu import ParallelStencil: checkargs_parallel, validate_body, parallel using ParallelStencil.Exceptions @@ -19,6 +19,9 @@ end import Metal if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end end +@static if PKG_POLYESTER in TEST_PACKAGES + import Polyester +end Base.retry_load_extensions() # Potentially needed to load the extensions after the packages have been filtered. import ParallelStencil.@gorgeousexpand @@ -27,7 +30,12 @@ import ParallelStencil.@gorgeousexpand @testset "$(basename(@__FILE__)) (package: $(nameof($package)))" begin @testset "1. parallel macros" begin @require !@is_initialized() - @init_parallel_stencil($package, Float64, 3) + # @static if $package == $PKG_METAL + # @init_parallel_stencil($package, Float32, 3) + # else + # @init_parallel_stencil($package, Float64, 3) + # end + @init_parallel_stencil($package, Float32, 3) @require @is_initialized() @testset "@parallel " begin # NOTE: calls must go to ParallelStencil.ParallelKernel.parallel and must therefore give the same result as in ParallelKernel, except for memopt tests (tests copied 1-to-1 from there). @static if $package == $PKG_CUDA @@ -170,17 +178,17 @@ import ParallelStencil.@gorgeousexpand return end @parallel write_indices!(A); - @test all(Array(A) .== [ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]) + @test all(Array(A) .≈ [ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]) end @testset "@parallel (3D; on-the-fly)" begin nx, ny, nz = 32, 8, 8 - lam=dt=_dx=_dy=_dz = 1.0 + lam=dt=_dx=_dy=_dz = 1.0f0 T = @zeros(nx, ny, nz); T2 = @zeros(nx, ny, nz); T2_ref = @zeros(nx, ny, nz); Ci = @ones(nx, ny, nz); copy!(T, [ix + (iy-1)*size(T,1) + (iz-1)*size(T,1)*size(T,2) for ix=1:size(T,1), iy=1:size(T,2), iz=1:size(T,3)].^3); - @parallel function diffusion3D_step!(T2, T, Ci, lam::Data.Number, dt::Float64, _dx, _dy, _dz) + @parallel function diffusion3D_step!(T2, T, Ci, lam, dt, _dx, _dy, _dz) @all(qx) = -lam*@d_xi(T)*_dx # Fourier's law of heat conduction @all(qy) = -lam*@d_yi(T)*_dy # ... @all(qz) = -lam*@d_zi(T)*_dz # ... 
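# Note on the 1.0 -> 1.0f0 literals and the dropped ::Float64 annotation in the hunk above: Metal
# GPUs offer no Float64 support, and a Float64 literal (or argument annotation) silently promotes
# Float32 expressions to Float64. Minimal promotion sketch (plain CPU Julia, illustrative values only):
a = 2.5f0
typeof(a * 1.0)     # Float64 -- the Float64 literal promotes the result
typeof(a * 1.0f0)   # Float32 -- a Float32 literal keeps the element type
typeof(a / 4)       # Float32 -- dividing by an integer literal also preserves Float32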
@@ -194,7 +202,7 @@ import ParallelStencil.@gorgeousexpand + ((T[2:end-1,3:end ,2:end-1] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,1:end-2,2:end-1])).*_dy^2 + ((T[2:end-1,2:end-1,3:end ] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,2:end-1,1:end-2])).*_dz^2) ); - @test all(Array(T2) .== Array(T2_ref)) + @test all(Array(T2) .≈ Array(T2_ref)) end @static if $package in [$PKG_CUDA, $PKG_AMDGPU] @testset "@parallel memopt (nx, ny, nz = x .* threads)" begin # NOTE: the following does not work for some reason: (nx, ny, nz = ($nx, $ny, $nz))" for (nx, ny, nz) in ((32, 8, 9), (32, 8, 8), (31, 7, 9), (33, 9, 9), (33, 7, 8)) @@ -211,7 +219,7 @@ import ParallelStencil.@gorgeousexpand return end @parallel memopt=true copy_memopt!(A2, A); - @test all(Array(A2) .== Array(A)) + @test all(Array(A2) .≈ Array(A)) end @testset "@parallel (3D, memopt, stencilranges=0:0)" begin A = @zeros(nx, ny, nz); @@ -222,7 +230,7 @@ import ParallelStencil.@gorgeousexpand return end @parallel memopt=true copy_memopt!(A2, A); - @test all(Array(A2) .== Array(A)) + @test all(Array(A2) .≈ Array(A)) end @testset "@parallel_indices (3D, memopt, stencilranges=(0:0, 0:0, -1:1); z-stencil)" begin A = @zeros(nx, ny, nz); @@ -237,7 +245,7 @@ import ParallelStencil.@gorgeousexpand end @parallel memopt=true d2_memopt!(A2, A); A2_ref[:,:,2:end-1] .= A[:,:,3:end] .- 2.0.*A[:,:,2:end-1] .+ A[:,:,1:end-2]; - @test all(Array(A2) .== Array(A2_ref)) + @test all(Array(A2) .≈ Array(A2_ref)) end @testset "@parallel_indices (3D, memopt, stencilranges=(0:0, -1:1, 0:0); y-stencil)" begin A = @zeros(nx, ny, nz); @@ -252,7 +260,7 @@ import ParallelStencil.@gorgeousexpand end @parallel memopt=true d2_memopt!(A2, A); A2_ref[:,2:end-1,:] .= A[:,3:end,:] .- 2.0.*A[:,2:end-1,:] .+ A[:,1:end-2,:]; - @test all(Array(A2) .== Array(A2_ref)) + @test all(Array(A2) .≈ Array(A2_ref)) end @testset "@parallel (3D, memopt, stencilranges=(1:1, 1:1, 0:2); z-stencil)" begin A = @zeros(nx, ny, nz); @@ -265,7 +273,7 @@ import ParallelStencil.@gorgeousexpand end @parallel memopt=true d2_memopt!(A2, A); A2_ref[2:end-1,2:end-1,2:end-1] .= A[2:end-1,2:end-1,3:end] .- 2.0.*A[2:end-1,2:end-1,2:end-1] .+ A[2:end-1,2:end-1,1:end-2]; - @test all(Array(A2) .== Array(A2_ref)) + @test all(Array(A2) .≈ Array(A2_ref)) end @testset "@parallel (3D, memopt, stencilranges=(1:1, 0:2, 1:1); y-stencil)" begin A = @zeros(nx, ny, nz); @@ -278,7 +286,7 @@ import ParallelStencil.@gorgeousexpand end @parallel memopt=true d2_memopt!(A2, A); A2_ref[2:end-1,2:end-1,2:end-1] .= A[2:end-1,3:end,2:end-1] .- 2.0.*A[2:end-1,2:end-1,2:end-1] .+ A[2:end-1,1:end-2,2:end-1]; - @test all(Array(A2) .== Array(A2_ref)) + @test all(Array(A2) .≈ Array(A2_ref)) end @testset "@parallel_indices (3D, memopt, stencilranges=-1:1)" begin lam=dt=_dx=_dy=_dz = 1.0 @@ -303,7 +311,7 @@ import ParallelStencil.@gorgeousexpand - ((.-lam.*(T[2:end-1,3:end ,2:end-1] .- T[2:end-1,2:end-1,2:end-1]).*_dy) .- (.-lam.*(T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,1:end-2,2:end-1]).*_dy)).*_dy - ((.-lam.*(T[2:end-1,2:end-1,3:end ] .- T[2:end-1,2:end-1,2:end-1]).*_dz) .- (.-lam.*(T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,2:end-1,1:end-2]).*_dz)).*_dz) ); - @test all(Array(T2) .== Array(T2_ref)) + @test all(Array(T2) .≈ Array(T2_ref)) end @testset "@parallel (3D, memopt, stencilranges=0:2)" begin lam=dt=_dx=_dy=_dz = 1.0 @@ -322,7 +330,7 @@ import ParallelStencil.@gorgeousexpand + ((T[2:end-1,3:end ,2:end-1] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- 
T[2:end-1,1:end-2,2:end-1])).*_dy^2 + ((T[2:end-1,2:end-1,3:end ] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,2:end-1,1:end-2])).*_dz^2) ); - @test all(Array(T2) .== Array(T2_ref)) + @test all(Array(T2) .≈ Array(T2_ref)) end @testset "@parallel_indices (3D, memopt, stencilranges=(-4:-1, 2:2, -2:3); x-z-stencil, y-shift)" begin A = @zeros(nx, ny, nz); @@ -337,7 +345,7 @@ import ParallelStencil.@gorgeousexpand end @parallel memopt=true higher_order_memopt!(A2, A); A2_ref[5:end-1,3:end,3:end-3] .= A[5:end-1,3:end,6:end] .- 2.0.*A[3:end-3,3:end,3:end-3] .+ A[2:end-4,3:end,1:end-5]; - @test all(Array(A2) .== Array(A2_ref)) + @test all(Array(A2) .≈ Array(A2_ref)) end @testset "@parallel (3D, memopt, stencilranges=0:2; on-the-fly)" begin lam=dt=_dx=_dy=_dz = 1.0 @@ -360,7 +368,7 @@ import ParallelStencil.@gorgeousexpand + ((T[2:end-1,3:end ,2:end-1] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,1:end-2,2:end-1])).*_dy^2 + ((T[2:end-1,2:end-1,3:end ] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,2:end-1,1:end-2])).*_dz^2) ); - @test all(Array(T2) .== Array(T2_ref)) + @test all(Array(T2) .≈ Array(T2_ref)) end @testset "@parallel (3D, memopt, stencilranges=0:0; 2 arrays)" begin A = @zeros(nx, ny, nz); @@ -373,7 +381,7 @@ import ParallelStencil.@gorgeousexpand return end @parallel memopt=true copy_memopt!(A2, A, B); - @test all(Array(A2) .== Array(A) .+ Array(B)) + @test all(Array(A2) .≈ Array(A) .+ Array(B)) end @testset "@parallel_indices (3D, memopt; 2 arrays, z-stencil)" begin A = @zeros(nx, ny, nz); @@ -390,7 +398,7 @@ import ParallelStencil.@gorgeousexpand end @parallel memopt=true d2_memopt!(A2, A, B); A2_ref[:,:,2:end-1] .= A[:,:,3:end] .- 2.0.*A[:,:,2:end-1] .+ A[:,:,1:end-2] .+ B[:,:,2:end] .- B[:,:,1:end-1]; - @test all(Array(A2) .== Array(A2_ref)) + @test all(Array(A2) .≈ Array(A2_ref)) end @testset "@parallel_indices (3D, memopt; 2 arrays, y-stencil)" begin A = @zeros(nx, ny, nz); @@ -407,7 +415,7 @@ import ParallelStencil.@gorgeousexpand end @parallel memopt=true d2_memopt!(A2, A, B); A2_ref[:,2:end-1,:] .= (((A[:,3:end,:] .- 2.0.*A[:,2:end-1,:]) .+ A[:,1:end-2,:] .+ B[:,3:end,:]) .- 2.0.*B[:,2:end-1,:]) .+ B[:,1:end-2,:]; - @test all(Array(A2) .== Array(A2_ref)) + @test all(Array(A2) .≈ Array(A2_ref)) end @testset "@parallel_indices (3D, memopt; 2 arrays, x-stencil)" begin A = @zeros(nx, ny, nz); @@ -424,7 +432,7 @@ import ParallelStencil.@gorgeousexpand end @parallel memopt=true d2_memopt!(A2, A, B); A2_ref[2:end-1,:,:] .= A[3:end,:,:] .- 2.0.*A[2:end-1,:,:] .+ A[1:end-2,:,:] .+ B[2:end,:,:] .- B[1:end-1,:,:]; - @test all(Array(A2) .== Array(A2_ref)) + @test all(Array(A2) .≈ Array(A2_ref)) end @testset "@parallel (3D, memopt; 2 arrays, x-y-z- + z-stencil)" begin lam=dt=_dx=_dy=_dz = 1.0 @@ -444,7 +452,7 @@ import ParallelStencil.@gorgeousexpand + ((T[2:end-1,3:end ,2:end-1] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,1:end-2,2:end-1])).*_dy^2 + ((T[2:end-1,2:end-1,3:end ] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,2:end-1,1:end-2])).*_dz^2) ); - @test all(Array(T2) .== Array(T2_ref)) + @test all(Array(T2) .≈ Array(T2_ref)) end @testset "@parallel (3D, memopt; 2 arrays, x-y-z- + x-stencil)" begin lam=dt=_dx=_dy=_dz = 1.0 @@ -464,7 +472,7 @@ import ParallelStencil.@gorgeousexpand + ((T[2:end-1,3:end ,2:end-1] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,1:end-2,2:end-1])).*_dy^2 + ((T[2:end-1,2:end-1,3:end ] 
.- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,2:end-1,1:end-2])).*_dz^2) ); - @test all(Array(T2) .== Array(T2_ref)) + @test all(Array(T2) .≈ Array(T2_ref)) end @testset "@parallel (3D, memopt; 3 arrays, x-y-z- + y- + x-stencil)" begin lam=dt=_dx=_dy=_dz = 1.0 @@ -486,7 +494,7 @@ import ParallelStencil.@gorgeousexpand + ((T[2:end-1,3:end ,2:end-1] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,1:end-2,2:end-1])).*_dy^2 + ((T[2:end-1,2:end-1,3:end ] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,2:end-1,1:end-2])).*_dz^2) ) + ((B[2:end-1,3:end ,2:end-1] .- B[2:end-1,2:end-1,2:end-1]) .- (B[2:end-1,2:end-1,2:end-1] .- B[2:end-1,1:end-2,2:end-1])); - @test all(Array(T2) .== Array(T2_ref)) + @test all(Array(T2) .≈ Array(T2_ref)) end @testset "@parallel_indices (3D, memopt, stencilranges=(-4:-1, 2:2, -2:3); 3 arrays, x-z-stencil, y-shift)" begin A = @zeros(nx, ny, nz); @@ -517,9 +525,9 @@ import ParallelStencil.@gorgeousexpand A2_ref[5:end-1,3:end,3:end-3] .= A[5:end-1,3:end,6:end] .- 2.0.*A[3:end-3,3:end,3:end-3] .+ A[2:end-4,3:end,1:end-5]; B2_ref[5:end-1,3:end,3:end-3] .= B[5:end-1,3:end,6:end] .- 2.0.*B[3:end-3,3:end,3:end-3] .+ B[2:end-4,3:end,1:end-5]; C2_ref[5:end-1,3:end,3:end-3] .= C[5:end-1,3:end,6:end] .- 2.0.*C[3:end-3,3:end,3:end-3] .+ C[2:end-4,3:end,1:end-5]; - @test all(Array(A2) .== Array(A2_ref)) - @test all(Array(B2) .== Array(B2_ref)) - @test all(Array(C2) .== Array(C2_ref)) + @test all(Array(A2) .≈ Array(A2_ref)) + @test all(Array(B2) .≈ Array(B2_ref)) + @test all(Array(C2) .≈ Array(C2_ref)) end @testset "@parallel_indices (3D, memopt, stencilranges=(A=(-4:-1, 2:2, -2:3), B=(-4:-1, 2:2, 1:2), C=(-4:-1, 2:2, -1:0)); 3 arrays, x-z-stencil, y-shift)" begin A = @zeros(nx, ny, nz); @@ -550,9 +558,9 @@ import ParallelStencil.@gorgeousexpand A2_ref[5:end-1,3:end,3:end-3] .= A[5:end-1,3:end,6:end] .- 2.0.*A[3:end-3,3:end,3:end-3] .+ A[2:end-4,3:end,1:end-5]; B2_ref[5:end-1,3:end,2:end-1] .= B[5:end-1,3:end,3:end] .- 2.0.*B[3:end-3,3:end,2:end-1] .+ B[2:end-4,3:end,2:end-1]; C2_ref[5:end-1,3:end,1:end-1] .= C[5:end-1,3:end,2:end] .- 2.0.*C[3:end-3,3:end,1:end-1] .+ C[2:end-4,3:end,1:end-1]; - @test all(Array(A2) .== Array(A2_ref)) - @test all(Array(B2) .== Array(B2_ref)) - @test all(Array(C2) .== Array(C2_ref)) + @test all(Array(A2) .≈ Array(A2_ref)) + @test all(Array(B2) .≈ Array(B2_ref)) + @test all(Array(C2) .≈ Array(C2_ref)) end @testset "@parallel_indices (3D, memopt, stencilranges=(A=(-4:-1, 2:2, -2:3), B=(-4:1, 2:2, 1:2), C=(-1:-1, 2:2, -1:0)); 3 arrays, x-z-stencil, y-shift)" begin A = @zeros(nx, ny, nz); @@ -583,9 +591,9 @@ import ParallelStencil.@gorgeousexpand A2_ref[5:end-1,3:end,3:end-3] .= A[5:end-1,3:end,6:end] .- 2.0.*A[3:end-3,3:end,3:end-3] .+ A[2:end-4,3:end,1:end-5]; B2_ref[7:end-1,3:end,2:end-1] .= B[7:end-1,3:end,3:end] .- 2.0.*B[3:end-5,3:end,2:end-1] .+ B[2:end-6,3:end,2:end-1]; C2_ref[2:end-1,3:end,1:end-1] .= C[2:end-1,3:end,2:end] .- 2.0.*C[2:end-1,3:end,1:end-1] .+ C[2:end-1,3:end,1:end-1]; - @test all(Array(A2) .== Array(A2_ref)) - @test all(Array(B2) .== Array(B2_ref)) - @test all(Array(C2) .== Array(C2_ref)) + @test all(Array(A2) .≈ Array(A2_ref)) + @test all(Array(B2) .≈ Array(B2_ref)) + @test all(Array(C2) .≈ Array(C2_ref)) end @testset "@parallel_indices (3D, memopt, optvars=(A, C), loopdim=3, loopsize=3, optranges=(A=(-4:-1, 2:2, -2:3), B=(-4:1, 2:2, 1:2), C=(-1:-1, 2:2, -1:0)); stencilranges=(A=(-4:-1, 2:2, -2:3), B=(-4:1, 2:2, 1:2), C=(-1:-1, 2:2, -1:0)), 3 
arrays, x-z-stencil, y-shift)" begin A = @zeros(nx, ny, nz); @@ -638,9 +646,9 @@ import ParallelStencil.@gorgeousexpand A2_ref[5:end-1,3:end,3:end-3] .= A[5:end-1,3:end,6:end] .- 2.0.*A[3:end-3,3:end,3:end-3] .+ A[2:end-4,3:end,1:end-5]; B2_ref[7:end-1,3:end,2:end-1] .= B[7:end-1,3:end,3:end] .- 2.0.*B[3:end-5,3:end,2:end-1] .+ B[2:end-6,3:end,2:end-1]; C2_ref[2:end-1,3:end,1:end-1] .= C[2:end-1,3:end,2:end] .- 2.0.*C[2:end-1,3:end,1:end-1] .+ C[2:end-1,3:end,1:end-1]; - @test all(Array(A2) .== Array(A2_ref)) - @test all(Array(B2) .== Array(B2_ref)) - @test all(Array(C2) .== Array(C2_ref)) + @test all(Array(A2) .≈ Array(A2_ref)) + @test all(Array(B2) .≈ Array(B2_ref)) + @test all(Array(C2) .≈ Array(C2_ref)) end @testset "@parallel_indices (3D, memopt, optvars=(A, C), loopdim=3, loopsize=3, optranges=(A=(-4:-1, 2:2, -2:3), B=(-4:1, 2:2, 1:2), C=(-1:-1, 2:2, -1:0)); stencilranges=(A=(-4:-1, 2:2, -2:3), B=(-4:1, 2:2, 1:2), C=(-1:-1, 2:2, -1:0)), 3 arrays, x-z-stencil, y-shift)" begin A = @zeros(nx, ny, nz); @@ -693,9 +701,9 @@ import ParallelStencil.@gorgeousexpand A2_ref[5:end-1,3:end,3:end-3] .= A[5:end-1,3:end,6:end] .- 2.0.*A[3:end-3,3:end,3:end-3] .+ A[2:end-4,3:end,1:end-5]; B2_ref[7:end-1,3:end,2:end-1] .= B[7:end-1,3:end,3:end] .- 2.0.*B[3:end-5,3:end,2:end-1] .+ B[2:end-6,3:end,2:end-1]; C2_ref[2:end-1,3:end,1:end-1] .= C[2:end-1,3:end,2:end] .- 2.0.*C[2:end-1,3:end,1:end-1] .+ C[2:end-1,3:end,1:end-1]; - @test all(Array(A2) .== Array(A2_ref)) - @test all(Array(B2) .== Array(B2_ref)) - @test all(Array(C2) .== Array(C2_ref)) + @test all(Array(A2) .≈ Array(A2_ref)) + @test all(Array(B2) .≈ Array(B2_ref)) + @test all(Array(C2) .≈ Array(C2_ref)) end @testset "@parallel_indices (3D, memopt, optvars=(A, B), loopdim=3, loopsize=3, optranges=(A=(-1:-1, 2:2, -2:3), B=(-4:1, 2:2, 1:1)); stencilranges=(A=(-4:-1, 2:2, -2:3), B=(-4:1, 2:2, 1:2), C=(-1:-1, 2:2, -1:0)), 3 arrays, x-z-stencil, y-shift)" begin A = @zeros(nx, ny, nz); @@ -741,9 +749,9 @@ import ParallelStencil.@gorgeousexpand A2_ref[5:end-1,3:end,3:end-3] .= A[5:end-1,3:end,6:end] .- 2.0.*A[3:end-3,3:end,3:end-3] .+ A[2:end-4,3:end,1:end-5]; B2_ref[7:end-1,3:end,2:end-1] .= B[7:end-1,3:end,3:end] .- 2.0.*B[3:end-5,3:end,2:end-1] .+ B[2:end-6,3:end,2:end-1]; C2_ref[2:end-1,3:end,1:end-1] .= C[2:end-1,3:end,2:end] .- 2.0.*C[2:end-1,3:end,1:end-1] .+ C[2:end-1,3:end,1:end-1]; - @test all(Array(A2) .== Array(A2_ref)) - @test all(Array(B2) .== Array(B2_ref)) - @test all(Array(C2) .== Array(C2_ref)) + @test all(Array(A2) .≈ Array(A2_ref)) + @test all(Array(B2) .≈ Array(B2_ref)) + @test all(Array(C2) .≈ Array(C2_ref)) end end @testset "@parallel memopt (nx, ny, nz != x .* threads)" begin @@ -760,7 +768,7 @@ import ParallelStencil.@gorgeousexpand end ranges = (1:64,1:64,1:8) # TODO: must be a multiple of the number of threads @parallel ranges memopt=true copy_memopt!(A2, A); - @test all(Array(A2) .== Array(A)) + @test all(Array(A2) .≈ Array(A)) end @testset "@parallel (3D, memopt, stencilranges=0:0)" begin A = @zeros(nx, ny, nz); @@ -771,7 +779,7 @@ import ParallelStencil.@gorgeousexpand return end @parallel memopt=true copy_memopt!(A2, A); - @test all(Array(A2) .== Array(A)) + @test all(Array(A2) .≈ Array(A)) end @testset "@parallel (3D, memopt, stencilranges=0:2)" begin lam=dt=_dx=_dy=_dz = 1.0 @@ -790,7 +798,7 @@ import ParallelStencil.@gorgeousexpand + ((T[2:end-1,3:end ,2:end-1] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,1:end-2,2:end-1])).*_dy^2 + ((T[2:end-1,2:end-1,3:end ] .- T[2:end-1,2:end-1,2:end-1]) 
.- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,2:end-1,1:end-2])).*_dz^2) ); - @test all(Array(T2) .== Array(T2_ref)) + @test all(Array(T2) .≈ Array(T2_ref)) end @testset "@parallel (3D, memopt; 3 arrays, x-y-z- + y- + x-stencil)" begin lam=dt=_dx=_dy=_dz = 1.0 @@ -812,7 +820,7 @@ import ParallelStencil.@gorgeousexpand + ((T[2:end-1,3:end ,2:end-1] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,1:end-2,2:end-1])).*_dy^2 + ((T[2:end-1,2:end-1,3:end ] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,2:end-1,1:end-2])).*_dz^2) ) + ((B[2:end-1,3:end ,2:end-1] .- B[2:end-1,2:end-1,2:end-1]) .- (B[2:end-1,2:end-1,2:end-1] .- B[2:end-1,1:end-2,2:end-1])); - @test all(Array(T2) .== Array(T2_ref)) + @test all(Array(T2) .≈ Array(T2_ref)) end end end @@ -826,7 +834,12 @@ import ParallelStencil.@gorgeousexpand end; @testset "2. parallel macros (2D)" begin @require !@is_initialized() - @init_parallel_stencil($package, Float64, 3) + # @static if $package == $PKG_METAL + # @init_parallel_stencil($package, Float32, 2) + # else + # @init_parallel_stencil($package, Float64, 2) + # end + @init_parallel_stencil($package, Float32, 2) @require @is_initialized() @static if $package in [$PKG_CUDA, $PKG_AMDGPU] nx, ny, nz = 32, 8, 1 @@ -851,7 +864,7 @@ import ParallelStencil.@gorgeousexpand - ((.-lam.*(T[3:end ,2:end-1,1] .- T[2:end-1,2:end-1,1]).*_dx) .- (.-lam.*(T[2:end-1,2:end-1,1] .- T[1:end-2,2:end-1,1]).*_dx)).*_dx - ((.-lam.*(T[2:end-1,3:end ,1] .- T[2:end-1,2:end-1,1]).*_dy) .- (.-lam.*(T[2:end-1,2:end-1,1] .- T[2:end-1,1:end-2,1]).*_dy)).*_dy) ); - @test all(Array(T2) .== Array(T2_ref)) + @test all(Array(T2) .≈ Array(T2_ref)) end; end; @reset_parallel_stencil() @@ -859,7 +872,12 @@ import ParallelStencil.@gorgeousexpand @testset "3. global defaults" begin @testset "inbounds=true" begin @require !@is_initialized() - @init_parallel_stencil($package, Float64, 1, inbounds=true) + # @static if $package == $PKG_METAL + # @init_parallel_stencil($package, Float32, 1, inbounds=true) + # else + # @init_parallel_stencil($package, Float64, 1, inbounds=true) + # end + @init_parallel_stencil($package, Float32, 1, inbounds=true) @require @is_initialized expansion = @prettystring(1, @parallel_indices (ix) inbounds=true f(A) = (2*A; return)) @test occursin("Base.@inbounds begin", expansion) @@ -871,7 +889,7 @@ import ParallelStencil.@gorgeousexpand end; @testset "@parallel_indices (I...) (1D)" begin @require !@is_initialized() - @init_parallel_stencil($package, Float64, 1) + @init_parallel_stencil($package, Float32, 1) @require @is_initialized A = @zeros(4*5*6) @parallel_indices (I...) function write_indices!(A) @@ -879,12 +897,17 @@ import ParallelStencil.@gorgeousexpand return end @parallel write_indices!(A); - @test all(Array(A) .== [(ix-1) for ix=1:size(A,1)]) + @test all(Array(A) .≈ [(ix-1) for ix=1:size(A,1)]) @reset_parallel_stencil() end; @testset "@parallel_indices (I...) (2D)" begin @require !@is_initialized() - @init_parallel_stencil($package, Float64, 2) + # @static if $package == $PKG_METAL + # @init_parallel_stencil($package, Float32, 2) + # else + # @init_parallel_stencil($package, Float64, 2) + # end + @init_parallel_stencil($package, Float32, 2) @require @is_initialized A = @zeros(4, 5*6) @parallel_indices (I...) 
function write_indices!(A) @@ -892,12 +915,17 @@ import ParallelStencil.@gorgeousexpand return end @parallel write_indices!(A); - @test all(Array(A) .== [(ix-1) + (iy-1)*size(A,1) for ix=1:size(A,1), iy=1:size(A,2)]) + @test all(Array(A) .≈ [(ix-1) + (iy-1)*size(A,1) for ix=1:size(A,1), iy=1:size(A,2)]) @reset_parallel_stencil() end; @testset "@parallel_indices (I...) (3D)" begin @require !@is_initialized() - @init_parallel_stencil($package, Float64, 3) + # @static if $package == $PKG_METAL + # @init_parallel_stencil($package, Float32, 3) + # else + # @init_parallel_stencil($package, Float64, 3) + # end + @init_parallel_stencil($package, Float32, 3) @require @is_initialized A = @zeros(4, 5, 6) @parallel_indices (I...) function write_indices!(A) @@ -905,7 +933,7 @@ import ParallelStencil.@gorgeousexpand return end @parallel write_indices!(A); - @test all(Array(A) .== [(ix-1) + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]) + @test all(Array(A) .≈ [(ix-1) + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]) @reset_parallel_stencil() end; end; @@ -1004,7 +1032,12 @@ import ParallelStencil.@gorgeousexpand @reset_parallel_stencil() end; @testset "5. Exceptions" begin - @init_parallel_stencil($package, Float64, 3) + # @static if $package == $PKG_METAL + # @init_parallel_stencil($package, Float32, 3) + # else + # @init_parallel_stencil($package, Float64, 3) + # end + @init_parallel_stencil($package, Float32, 3) @require @is_initialized @testset "arguments @parallel" begin @test_throws ArgumentError checkargs_parallel(); # Error: isempty(args) diff --git a/test/test_projects/Diffusion3D_minimal/test/localtest_diffusion_Metal.jl b/test/test_projects/Diffusion3D_minimal/test/localtest_diffusion_Metal.jl new file mode 100644 index 00000000..2f2df9e0 --- /dev/null +++ b/test/test_projects/Diffusion3D_minimal/test/localtest_diffusion_Metal.jl @@ -0,0 +1,8 @@ +push!(LOAD_PATH, "@stdlib") # NOTE: this is needed to enable this test to run from the Pkg manager +push!(LOAD_PATH, joinpath(@__DIR__, "..")) +using Test +using Pkg +Pkg.activate(joinpath(@__DIR__, "..")) +Pkg.instantiate() +import Metal +using Diffusion3D_minimal diff --git a/test/test_reset_parallel_stencil.jl b/test/test_reset_parallel_stencil.jl index 870f46c3..d160537e 100644 --- a/test/test_reset_parallel_stencil.jl +++ b/test/test_reset_parallel_stencil.jl @@ -1,6 +1,6 @@ using Test using ParallelStencil -import ParallelStencil: @reset_parallel_stencil, @is_initialized, @get_package, @get_numbertype, @get_ndims, SUPPORTED_PACKAGES, PKG_CUDA, PKG_AMDGPU, PKG_NONE, NUMBERTYPE_NONE, NDIMS_NONE +import ParallelStencil: @reset_parallel_stencil, @is_initialized, @get_package, @get_numbertype, @get_ndims, SUPPORTED_PACKAGES, PKG_CUDA, PKG_AMDGPU, PKG_METAL, PKG_POLYESTER, PKG_NONE, NUMBERTYPE_NONE, NDIMS_NONE import ParallelStencil: @require, @symbols TEST_PACKAGES = SUPPORTED_PACKAGES @static if PKG_CUDA in TEST_PACKAGES @@ -15,6 +15,9 @@ end import Metal if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end end +@static if PKG_POLYESTER in TEST_PACKAGES + import Polyester +end Base.retry_load_extensions() # Potentially needed to load the extensions after the packages have been filtered. 
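# Illustration (a minimal sketch, not from the patch itself): the following patch rewrites
# Float64 literals in FiniteDifferences.jl, e.g. `*0.25` becomes `/4`, because a Float64
# literal promotes Float32 data to Float64 (unsupported e.g. on the Metal backend), whereas
# an integer literal preserves the element type:
x = 1.0f0              # a Float32 value, as stored in a Float32 array
typeof(x * 0.25)       # Float64 -- the Float64 literal promotes the result
typeof(x / 4)          # Float32 -- the integer literal does not promote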
@static for package in TEST_PACKAGES eval(:( From 0a5858c6ebafa35e5fe21a0a16e25266aacf9e7f Mon Sep 17 00:00:00 2001 From: GiackAloZ Date: Wed, 9 Oct 2024 13:41:56 +0200 Subject: [PATCH 18/46] Replacing multiplications with floating point with division by integer (WIP) --- src/FiniteDifferences.jl | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/FiniteDifferences.jl b/src/FiniteDifferences.jl index 584e92dd..4ccc77c0 100644 --- a/src/FiniteDifferences.jl +++ b/src/FiniteDifferences.jl @@ -167,12 +167,12 @@ macro all(A) @expandargs(A); esc(:( $A[$ix ,$iy ] )) end macro inn(A) @expandargs(A); esc(:( $A[$ixi ,$iyi ] )) end macro inn_x(A) @expandargs(A); esc(:( $A[$ixi ,$iy ] )) end macro inn_y(A) @expandargs(A); esc(:( $A[$ix ,$iyi ] )) end -macro av(A) @expandargs(A); esc(:(($A[$ix ,$iy ] + $A[$ix+1,$iy ] + $A[$ix,$iy+1] + $A[$ix+1,$iy+1])*0.25 )) end +macro av(A) @expandargs(A); esc(:(($A[$ix ,$iy ] + $A[$ix+1,$iy ] + $A[$ix,$iy+1] + $A[$ix+1,$iy+1])/4 )) end macro av_xa(A) @expandargs(A); esc(:(($A[$ix ,$iy ] + $A[$ix+1,$iy ] )/2 )) end macro av_ya(A) @expandargs(A); esc(:(($A[$ix ,$iy ] + $A[$ix ,$iy+1] )/2 )) end macro av_xi(A) @expandargs(A); esc(:(($A[$ix ,$iyi ] + $A[$ix+1,$iyi ] )/2 )) end macro av_yi(A) @expandargs(A); esc(:(($A[$ixi ,$iy ] + $A[$ixi ,$iy+1] )/2 )) end -macro harm(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iy ] + 1/$A[$ix+1,$iy ] + 1/$A[$ix,$iy+1] + 1/$A[$ix+1,$iy+1])*4.0 )) end +macro harm(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iy ] + 1/$A[$ix+1,$iy ] + 1/$A[$ix,$iy+1] + 1/$A[$ix+1,$iy+1])*4 )) end macro harm_xa(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iy ] + 1/$A[$ix+1,$iy ] )*2 )) end macro harm_ya(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iy ] + 1/$A[$ix ,$iy+1] )*2 )) end macro harm_xi(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iyi ] + 1/$A[$ix+1,$iyi ] )*2 )) end @@ -342,7 +342,7 @@ macro inn_yz(A) @expandargs(A); esc(:( $A[$ix ,$iyi ,$izi ] )) end macro av(A) @expandargs(A); esc(:(($A[$ix ,$iy ,$iz ] + $A[$ix+1,$iy ,$iz ] + $A[$ix+1,$iy+1,$iz ] + $A[$ix+1,$iy+1,$iz+1] + $A[$ix ,$iy+1,$iz+1] + $A[$ix ,$iy ,$iz+1] + - $A[$ix+1,$iy ,$iz+1] + $A[$ix ,$iy+1,$iz ] )*0.125)) end + $A[$ix+1,$iy ,$iz+1] + $A[$ix ,$iy+1,$iz ] )/8)) end macro av_xa(A) @expandargs(A); esc(:(($A[$ix ,$iy ,$iz ] + $A[$ix+1,$iy ,$iz ] )/2 )) end macro av_ya(A) @expandargs(A); esc(:(($A[$ix ,$iy ,$iz ] + $A[$ix ,$iy+1,$iz ] )/2 )) end macro av_za(A) @expandargs(A); esc(:(($A[$ix ,$iy ,$iz ] + $A[$ix ,$iy ,$iz+1] )/2 )) end @@ -350,21 +350,21 @@ macro av_xi(A) @expandargs(A); esc(:(($A[$ix ,$iyi ,$izi ] + $A[$ix+1,$iyi macro av_yi(A) @expandargs(A); esc(:(($A[$ixi ,$iy ,$izi ] + $A[$ixi ,$iy+1,$izi ] )/2 )) end macro av_zi(A) @expandargs(A); esc(:(($A[$ixi ,$iyi ,$iz ] + $A[$ixi ,$iyi ,$iz+1] )/2 )) end macro av_xya(A) @expandargs(A); esc(:(($A[$ix ,$iy ,$iz ] + $A[$ix+1,$iy ,$iz ] + - $A[$ix ,$iy+1,$iz ] + $A[$ix+1,$iy+1,$iz ] )*0.25 )) end + $A[$ix ,$iy+1,$iz ] + $A[$ix+1,$iy+1,$iz ] )/4 )) end macro av_xza(A) @expandargs(A); esc(:(($A[$ix ,$iy ,$iz ] + $A[$ix+1,$iy ,$iz ] + - $A[$ix ,$iy ,$iz+1] + $A[$ix+1,$iy ,$iz+1] )*0.25 )) end + $A[$ix ,$iy ,$iz+1] + $A[$ix+1,$iy ,$iz+1] )/4 )) end macro av_yza(A) @expandargs(A); esc(:(($A[$ix ,$iy ,$iz ] + $A[$ix ,$iy+1,$iz ] + - $A[$ix ,$iy ,$iz+1] + $A[$ix ,$iy+1,$iz+1] )*0.25 )) end + $A[$ix ,$iy ,$iz+1] + $A[$ix ,$iy+1,$iz+1] )/4 )) end macro av_xyi(A) @expandargs(A); esc(:(($A[$ix ,$iy ,$izi ] + $A[$ix+1,$iy ,$izi ] + - $A[$ix ,$iy+1,$izi ] + $A[$ix+1,$iy+1,$izi ] )*0.25 )) end + $A[$ix ,$iy+1,$izi ] + 
$A[$ix+1,$iy+1,$izi ] )/4 )) end macro av_xzi(A) @expandargs(A); esc(:(($A[$ix ,$iyi ,$iz ] + $A[$ix+1,$iyi ,$iz ] + - $A[$ix ,$iyi ,$iz+1] + $A[$ix+1,$iyi ,$iz+1] )*0.25 )) end + $A[$ix ,$iyi ,$iz+1] + $A[$ix+1,$iyi ,$iz+1] )/4 )) end macro av_yzi(A) @expandargs(A); esc(:(($A[$ixi ,$iy ,$iz ] + $A[$ixi ,$iy+1,$iz ] + - $A[$ixi ,$iy ,$iz+1] + $A[$ixi ,$iy+1,$iz+1] )*0.25 )) end + $A[$ixi ,$iy ,$iz+1] + $A[$ixi ,$iy+1,$iz+1] )/4 )) end macro harm(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iy ,$iz ] + 1/$A[$ix+1,$iy ,$iz ] + 1/$A[$ix+1,$iy+1,$iz ] + 1/$A[$ix+1,$iy+1,$iz+1] + 1/$A[$ix ,$iy+1,$iz+1] + 1/$A[$ix ,$iy ,$iz+1] + - 1/$A[$ix+1,$iy ,$iz+1] + 1/$A[$ix ,$iy+1,$iz ] )*8.0)) end + 1/$A[$ix+1,$iy ,$iz+1] + 1/$A[$ix ,$iy+1,$iz ] )*8)) end macro harm_xa(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iy ,$iz ] + 1/$A[$ix+1,$iy ,$iz ] )*2 )) end macro harm_ya(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iy ,$iz ] + 1/$A[$ix ,$iy+1,$iz ] )*2 )) end macro harm_za(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iy ,$iz ] + 1/$A[$ix ,$iy ,$iz+1] )*2 )) end @@ -372,17 +372,17 @@ macro harm_xi(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iyi ,$izi ] + 1/$A[$ix+ macro harm_yi(A) @expandargs(A); esc(:(1/(1/$A[$ixi ,$iy ,$izi ] + 1/$A[$ixi ,$iy+1,$izi ] )*2 )) end macro harm_zi(A) @expandargs(A); esc(:(1/(1/$A[$ixi ,$iyi ,$iz ] + 1/$A[$ixi ,$iyi ,$iz+1] )*2 )) end macro harm_xya(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iy ,$iz ] + 1/$A[$ix+1,$iy ,$iz ] + - 1/$A[$ix ,$iy+1,$iz ] + 1/$A[$ix+1,$iy+1,$iz ] )*4.0 )) end + 1/$A[$ix ,$iy+1,$iz ] + 1/$A[$ix+1,$iy+1,$iz ] )*4 )) end macro harm_xza(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iy ,$iz ] + 1/$A[$ix+1,$iy ,$iz ] + - 1/$A[$ix ,$iy ,$iz+1] + 1/$A[$ix+1,$iy ,$iz+1] )*4.0 )) end + 1/$A[$ix ,$iy ,$iz+1] + 1/$A[$ix+1,$iy ,$iz+1] )*4 )) end macro harm_yza(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iy ,$iz ] + 1/$A[$ix ,$iy+1,$iz ] + - 1/$A[$ix ,$iy ,$iz+1] + 1/$A[$ix ,$iy+1,$iz+1] )*4.0 )) end + 1/$A[$ix ,$iy ,$iz+1] + 1/$A[$ix ,$iy+1,$iz+1] )*4 )) end macro harm_xyi(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iy ,$izi ] + 1/$A[$ix+1,$iy ,$izi ] + - 1/$A[$ix ,$iy+1,$izi ] + 1/$A[$ix+1,$iy+1,$izi ] )*4.0 )) end + 1/$A[$ix ,$iy+1,$izi ] + 1/$A[$ix+1,$iy+1,$izi ] )*4 )) end macro harm_xzi(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iyi ,$iz ] + 1/$A[$ix+1,$iyi ,$iz ] + - 1/$A[$ix ,$iyi ,$iz+1] + 1/$A[$ix+1,$iyi ,$iz+1] )*4.0 )) end + 1/$A[$ix ,$iyi ,$iz+1] + 1/$A[$ix+1,$iyi ,$iz+1] )*4 )) end macro harm_yzi(A) @expandargs(A); esc(:(1/(1/$A[$ixi ,$iy ,$iz ] + 1/$A[$ixi ,$iy+1,$iz ] + - 1/$A[$ixi ,$iy ,$iz+1] + 1/$A[$ixi ,$iy+1,$iz+1] )*4.0 )) end + 1/$A[$ixi ,$iy ,$iz+1] + 1/$A[$ixi ,$iy+1,$iz+1] )*4 )) end macro maxloc(A) @expandargs(A); esc(:( max( max( max( max($A[$ixi-1,$iyi ,$izi ], $A[$ixi+1,$iyi ,$izi ]) , $A[$ixi ,$iyi ,$izi ] ), max($A[$ixi ,$iyi-1,$izi ], $A[$ixi ,$iyi+1,$izi ]) ), max($A[$ixi ,$iyi ,$izi-1], $A[$ixi ,$iyi ,$izi+1]) ) )) end From 16105a51c97b114feffbd5cfe37412af0dc576e6 Mon Sep 17 00:00:00 2001 From: GiackAloZ Date: Wed, 9 Oct 2024 15:40:01 +0200 Subject: [PATCH 19/46] Add multiple precisions testing and fix literals --- test/ParallelKernel/test_allocators.jl | 8 +- .../ParallelKernel/test_hide_communication.jl | 17 +- test/ParallelKernel/test_kernel_language.jl | 35 ++- test/ParallelKernel/test_parallel.jl | 79 +++--- test/test_FiniteDifferences1D.jl | 21 +- test/test_FiniteDifferences2D.jl | 15 +- test/test_FiniteDifferences3D.jl | 20 +- test/test_parallel.jl | 246 ++++++++---------- 8 files changed, 220 insertions(+), 221 deletions(-) diff --git 
a/test/ParallelKernel/test_allocators.jl b/test/ParallelKernel/test_allocators.jl index 767d1333..a4abfa4b 100644 --- a/test/ParallelKernel/test_allocators.jl +++ b/test/ParallelKernel/test_allocators.jl @@ -28,7 +28,9 @@ end Base.retry_load_extensions() # Potentially needed to load the extensions after the packages have been filtered. const DATA_INDEX = ParallelStencil.INT_THREADS # TODO: using Data.Index does not work in combination with @reset_parallel_kernel, because the macros from module Test alternate the order of evaluation, resulting in the Data module being replaced with an empty module before Data.Index is evaluated. If at some point the indexing varies depending on the used package, then something more sophisticated is needed here (e.g., wrapping the test for each package in a module and using then Data.Index everywhere). -@static for package in TEST_PACKAGES eval(:( +for package in TEST_PACKAGES + +eval(:( @testset "$(basename(@__FILE__)) (package: $(nameof($package)))" begin @testset "1. @CellType macro" begin @require !@is_initialized() @@ -553,4 +555,6 @@ const DATA_INDEX = ParallelStencil.INT_THREADS # TODO: using Data.Index does not @reset_parallel_kernel() end; end; -)) end == nothing || true; +)) + +end == nothing || true; diff --git a/test/ParallelKernel/test_hide_communication.jl b/test/ParallelKernel/test_hide_communication.jl index 402b61e7..6c7c7704 100644 --- a/test/ParallelKernel/test_hide_communication.jl +++ b/test/ParallelKernel/test_hide_communication.jl @@ -23,11 +23,16 @@ end end Base.retry_load_extensions() # Potentially needed to load the extensions after the packages have been filtered. -@static for package in TEST_PACKAGES eval(:( - @testset "$(basename(@__FILE__)) (package: $(nameof($package)))" begin +const TEST_PRECISIONS = [Float32, Float64] +for package in TEST_PACKAGES +for precision in TEST_PRECISIONS +(package == PKG_METAL && precision == Float64) ? continue : nothing # Metal does not support Float64 + +eval(:( + @testset "$(basename(@__FILE__)) (package: $(nameof($package))) (precision: $(nameof($precision)))" begin @testset "1. hide_communication macro" begin @require !@is_initialized() - @init_parallel_kernel($package, Float32) + @init_parallel_kernel($package, $precision) @require @is_initialized() @testset "@hide_communication boundary_width block (macro expansion)" begin @static if @isgpu($package) @@ -171,7 +176,7 @@ Base.retry_load_extensions() # Potentially needed to load the extensions after t end; @testset "2. Exceptions" begin @require !@is_initialized() - @init_parallel_kernel($package, Float32) + @init_parallel_kernel($package, $precision) @require @is_initialized @testset "arguments @hide_communication" begin @test_throws ArgumentError checkargs_hide_communication(:boundary_width, :block) # Error: the last argument must be a code block. @@ -211,4 +216,6 @@ Base.retry_load_extensions() # Potentially needed to load the extensions after t @reset_parallel_kernel() end; end; -)) end == nothing || true; +)) + +end end == nothing || true; diff --git a/test/ParallelKernel/test_kernel_language.jl b/test/ParallelKernel/test_kernel_language.jl index eb56a91b..8cc48b37 100644 --- a/test/ParallelKernel/test_kernel_language.jl +++ b/test/ParallelKernel/test_kernel_language.jl @@ -23,16 +23,16 @@ end end Base.retry_load_extensions() # Potentially needed to load the extensions after the packages have been filtered. 
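# Illustration (a hypothetical, self-contained sketch, not from the patch itself) of the
# package x precision test matrix that these test files now use; Metal is paired only with
# Float32 because Apple GPUs provide no Float64 support:
for backend in (:Threads, :CUDA, :AMDGPU, :Metal), precision in (Float32, Float64)
    (backend == :Metal && precision == Float64) && continue  # skip the unsupported combination
    println("would test backend=$backend with precision=$precision")
end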
-@static for package in TEST_PACKAGES eval(:( - @testset "$(basename(@__FILE__)) (package: $(nameof($package)))" begin +const TEST_PRECISIONS = [Float32, Float64] +for package in TEST_PACKAGES +for precision in TEST_PRECISIONS +(package == PKG_METAL && precision == Float64) ? continue : nothing # Metal does not support Float64 + +eval(:( + @testset "$(basename(@__FILE__)) (package: $(nameof($package))) (precision: $(nameof($precision)))" begin @testset "1. kernel language macros" begin @require !@is_initialized() - # @static if $package == $PKG_METAL - # @init_parallel_kernel($package, Float32) - # else - # @init_parallel_kernel($package, Float64) - # end - @init_parallel_kernel($package, Float32) + @init_parallel_kernel($package, $precision) @require @is_initialized() @testset "mapping to package" begin if $package == $PKG_CUDA @@ -41,7 +41,7 @@ Base.retry_load_extensions() # Potentially needed to load the extensions after t @test @prettystring(1, @blockDim()) == "CUDA.blockDim()" @test @prettystring(1, @threadIdx()) == "CUDA.threadIdx()" @test @prettystring(1, @sync_threads()) == "CUDA.sync_threads()" - @test @prettystring(1, @sharedMem(Float32, (2,3))) == "CUDA.@cuDynamicSharedMem Float32 (2, 3)" + @test @prettystring(1, @sharedMem($precision, (2,3))) == "CUDA.@cuDynamicSharedMem $precision (2, 3)" # @test @prettystring(1, @pk_show()) == "CUDA.@cushow" # @test @prettystring(1, @pk_println()) == "CUDA.@cuprintln" elseif $package == $AMDGPU @@ -59,7 +59,7 @@ Base.retry_load_extensions() # Potentially needed to load the extensions after t @test @prettystring(1, @blockDim()) == "Metal.threads_per_threadgroup_3d()" @test @prettystring(1, @threadIdx()) == "Metal.thread_position_in_threadgroup_3d()" @test @prettystring(1, @sync_threads()) == "Metal.threadgroup_barrier(; flag = Metal.MemoryFlagThreadGroup)" - @test @prettystring(1, @sharedMem(Float32, (2,3))) == "ParallelStencil.ParallelKernel.@sharedMem_metal Float32 (2, 3)" + @test @prettystring(1, @sharedMem($precision, (2,3))) == "ParallelStencil.ParallelKernel.@sharedMem_metal $(nameof($precision)) (2, 3)" # @test @prettystring(1, @pk_show()) == "Metal.@mtlshow" # @test @prettystring(1, @pk_println()) == "Metal.@mtlprintln" elseif @iscpu($package) @@ -68,7 +68,7 @@ Base.retry_load_extensions() # Potentially needed to load the extensions after t @test @prettystring(1, @blockDim()) == "ParallelStencil.ParallelKernel.@blockDim_cpu" @test @prettystring(1, @threadIdx()) == "ParallelStencil.ParallelKernel.@threadIdx_cpu" @test @prettystring(1, @sync_threads()) == "ParallelStencil.ParallelKernel.@sync_threads_cpu" - @test @prettystring(1, @sharedMem(Float32, (2,3))) == "ParallelStencil.ParallelKernel.@sharedMem_cpu Float32 (2, 3)" + @test @prettystring(1, @sharedMem($precision, (2,3))) == "ParallelStencil.ParallelKernel.@sharedMem_cpu $(nameof($precision)) (2, 3)" # @test @prettystring(1, @pk_show()) == "Base.@show" # @test @prettystring(1, @pk_println()) == "Base.println()" end; @@ -138,7 +138,7 @@ Base.retry_load_extensions() # Potentially needed to load the extensions after t end; @testset "shared memory (allocation)" begin @static if @iscpu($package) - @test typeof(@sharedMem(Float32,(2,3))) == typeof(ParallelStencil.ParallelKernel.MArray{Tuple{2,3}, Float32, length((2,3)), prod((2,3))}(undef)) + @test typeof(@sharedMem($precision,(2,3))) == typeof(ParallelStencil.ParallelKernel.MArray{Tuple{2,3}, $precision, length((2,3)), prod((2,3))}(undef)) @test typeof(@sharedMem(Bool,(2,3,4))) == 
typeof(ParallelStencil.ParallelKernel.MArray{Tuple{2,3,4}, Bool, length((2,3,4)), prod((2,3,4))}(undef)) end; end; @@ -214,12 +214,7 @@ Base.retry_load_extensions() # Potentially needed to load the extensions after t @reset_parallel_kernel() end; @testset "2. Exceptions" begin - # @static if $package == $PKG_METAL - # @init_parallel_kernel($package, Float32) - # else - # @init_parallel_kernel($package, Float64) - # end - @init_parallel_kernel($package, Float32) + @init_parallel_kernel($package, $precision) @require @is_initialized @testset "no arguments" begin @test_throws ArgumentError checknoargs(:(something)); # Error: length(args) != 0 @@ -232,4 +227,6 @@ Base.retry_load_extensions() # Potentially needed to load the extensions after t @reset_parallel_kernel() end; end; -)) end == nothing || true; +)) + +end end == nothing || true; diff --git a/test/ParallelKernel/test_parallel.jl b/test/ParallelKernel/test_parallel.jl index 5965c791..8a4d4538 100644 --- a/test/ParallelKernel/test_parallel.jl +++ b/test/ParallelKernel/test_parallel.jl @@ -28,16 +28,17 @@ Base.retry_load_extensions() # Potentially needed to load the extensions after t macro compute(A) esc(:($(INDICES[1]) + ($(INDICES[2])-1)*size($A,1))) end macro compute_with_aliases(A) esc(:(ix + (iz -1)*size($A,1))) end import Enzyme -@static for package in TEST_PACKAGES eval(:( - @testset "$(basename(@__FILE__)) (package: $(nameof($package)))" begin + +const TEST_PRECISIONS = [Float32, Float64] +for package in TEST_PACKAGES +for precision in TEST_PRECISIONS +(package == PKG_METAL && precision == Float64) ? continue : nothing # Metal does not support Float64 + +eval(:( + @testset "$(basename(@__FILE__)) (package: $(nameof($package))) (precision: $(nameof($precision)))" begin @testset "1. parallel macros" begin @require !@is_initialized() - # @static if $package == $PKG_METAL - # @init_parallel_kernel($package, Float32) - # else - # @init_parallel_kernel($package, Float64) - # end - @init_parallel_kernel($package, Float32) + @init_parallel_kernel($package, $precision) @require @is_initialized() @testset "@parallel" begin @static if $package == $PKG_CUDA @@ -112,8 +113,8 @@ import Enzyme B̄ = @ones(N) A_ref = Array(A) B_ref = Array(B) - Ā_ref = ones(Float32, N) - B̄_ref = ones(Float32, N) + Ā_ref = ones($precision, N) + B̄_ref = ones($precision, N) @parallel_indices (ix) function f!(A, B, a) A[ix] += a * B[ix] * 100.65 return @@ -422,21 +423,21 @@ import Enzyme @reset_parallel_kernel() end; @testset "2. 
parallel macros (literal conversion)" begin - # @testset "@parallel_indices (Float64)" begin - # @require !@is_initialized() - # @static if $package == $PKG_METAL - # return - # end - # @require @is_initialized() - # expansion = @gorgeousstring(@parallel_indices (ix) f!(A) = (A[ix] = A[ix] + 1.0f0; return)) - # @test occursin("A[ix] = A[ix] + 1.0\n", expansion) - # @reset_parallel_kernel() - # end; + if $package != $PKG_METAL + @testset "@parallel_indices (Float64)" begin + @require !@is_initialized() + @init_parallel_kernel($package, Float64) + @require @is_initialized() + expansion = @gorgeousstring(@parallel_indices (ix) f!(A) = (A[ix] = A[ix] + 1.0; return)) + @test occursin("A[ix] = A[ix] + 1.0\n", expansion) + @reset_parallel_kernel() + end; + end @testset "@parallel_indices (Float32)" begin @require !@is_initialized() @init_parallel_kernel($package, Float32) @require @is_initialized() - expansion = @gorgeousstring(@parallel_indices (ix) f!(A) = (A[ix] = A[ix] + 1.0; return)) + expansion = @gorgeousstring(@parallel_indices (ix) f!(A) = (A[ix] = A[ix] + 1.0f0; return)) @test occursin("A[ix] = A[ix] + 1.0f0\n", expansion) @reset_parallel_kernel() end; @@ -448,14 +449,16 @@ import Enzyme @test occursin("A[ix] = A[ix] + Float16(1.0)\n", expansion) @reset_parallel_kernel() end; - @testset "@parallel_indices (ComplexF64)" begin - @require !@is_initialized() - @init_parallel_kernel($package, ComplexF64) - @require @is_initialized() - expansion = @gorgeousstring(@parallel_indices (ix) f!(A) = (A[ix] = 2.0f0 - 1.0f0im - A[ix] + 1.0f0; return)) - @test occursin("A[ix] = ((2.0 - 1.0im) - A[ix]) + 1.0\n", expansion) - @reset_parallel_kernel() - end; + if $package != $PKG_METAL + @testset "@parallel_indices (ComplexF64)" begin + @require !@is_initialized() + @init_parallel_kernel($package, ComplexF64) + @require @is_initialized() + expansion = @gorgeousstring(@parallel_indices (ix) f!(A) = (A[ix] = 2.0f0 - 1.0f0im - A[ix] + 1.0f0; return)) + @test occursin("A[ix] = ((2.0 - 1.0im) - A[ix]) + 1.0\n", expansion) + @reset_parallel_kernel() + end; + end @testset "@parallel_indices (ComplexF32)" begin @require !@is_initialized() @init_parallel_kernel($package, ComplexF32) @@ -476,12 +479,7 @@ import Enzyme @testset "3. global defaults" begin @testset "inbounds=true" begin @require !@is_initialized() - # @static if $package == $PKG_METAL - # @init_parallel_kernel($package, Float32, inbounds=true) - # else - # @init_parallel_kernel($package, Float64, inbounds=true) - # end - @init_parallel_kernel($package, Float32, inbounds=true) + @init_parallel_kernel($package, $precision, inbounds=true) @require @is_initialized expansion = @prettystring(1, @parallel_indices (ix) inbounds=true f(A) = (2*A; return)) @test occursin("Base.@inbounds begin", expansion) @@ -518,12 +516,7 @@ import Enzyme end; @testset "5. 
Exceptions" begin @require !@is_initialized() - # @static if $package == $PKG_METAL - # @init_parallel_kernel($package, Float32) - # else - # @init_parallel_kernel($package, Float64) - # end - @init_parallel_kernel($package, Float32) + @init_parallel_kernel($package, $precision) @require @is_initialized @testset "arguments @parallel" begin @test_throws ArgumentError checkargs_parallel(); # Error: isempty(args) @@ -558,4 +551,6 @@ import Enzyme @reset_parallel_kernel() end; end; -)) end == nothing || true; +)) + +end end == nothing || true; diff --git a/test/test_FiniteDifferences1D.jl b/test/test_FiniteDifferences1D.jl index 63934e13..97753b2f 100644 --- a/test/test_FiniteDifferences1D.jl +++ b/test/test_FiniteDifferences1D.jl @@ -21,15 +21,15 @@ end end Base.retry_load_extensions() # Potentially needed to load the extensions after the packages have been filtered. -@static for package in TEST_PACKAGES eval(:( - @testset "$(basename(@__FILE__)) (package: $(nameof($package)))" begin +const TEST_PRECISIONS = [Float32, Float64] +for package in TEST_PACKAGES +for precision in TEST_PRECISIONS +(package == PKG_METAL && precision == Float64) ? continue : nothing # Metal does not support Float64 + +eval(:( + @testset "$(basename(@__FILE__)) (package: $(nameof($package))) (precision: $(nameof($precision)))" begin @require !@is_initialized() - # @static if $package == $PKG_METAL - # @init_parallel_stencil($package, Float32, 1) - # else - # @init_parallel_stencil($package, Float64, 1) - # end - @init_parallel_stencil($package, Float32, 1) + @init_parallel_stencil($package, $precision, 1) @require @is_initialized() nx = 7 A = @rand(nx ); @@ -83,4 +83,7 @@ Base.retry_load_extensions() # Potentially needed to load the extensions after t end; @reset_parallel_stencil() end; -)) end == nothing || true; +)) + +end +end == nothing || true; diff --git a/test/test_FiniteDifferences2D.jl b/test/test_FiniteDifferences2D.jl index 4b094e2e..73dd0aea 100644 --- a/test/test_FiniteDifferences2D.jl +++ b/test/test_FiniteDifferences2D.jl @@ -21,10 +21,15 @@ end end Base.retry_load_extensions() # Potentially needed to load the extensions after the packages have been filtered. -@static for package in TEST_PACKAGES eval(:( - @testset "$(basename(@__FILE__)) (package: $(nameof($package)))" begin +const TEST_PRECISIONS = [Float32, Float64] +for package in TEST_PACKAGES +for precision in TEST_PRECISIONS +(package == PKG_METAL && precision == Float64) ? continue : nothing # Metal does not support Float64 + +eval(:( + @testset "$(basename(@__FILE__)) (package: $(nameof($package))) (precision: $(nameof($precision)))" begin @require !@is_initialized() - @init_parallel_stencil($package, Float32, 2) + @init_parallel_stencil($package, $precision, 2) @require @is_initialized() nx, ny = 7, 5 A = @rand(nx, ny ); @@ -119,4 +124,6 @@ Base.retry_load_extensions() # Potentially needed to load the extensions after t end; @reset_parallel_stencil() end; -)) end == nothing || true; +)) + +end end == nothing || true; diff --git a/test/test_FiniteDifferences3D.jl b/test/test_FiniteDifferences3D.jl index 1ccdb7bb..844062f7 100644 --- a/test/test_FiniteDifferences3D.jl +++ b/test/test_FiniteDifferences3D.jl @@ -21,15 +21,15 @@ end end Base.retry_load_extensions() # Potentially needed to load the extensions after the packages have been filtered. 
-@static for package in TEST_PACKAGES eval(:( - @testset "$(basename(@__FILE__)) (package: $(nameof($package)))" begin +const TEST_PRECISIONS = [Float32, Float64] +for package in TEST_PACKAGES +for precision in TEST_PRECISIONS +(package == PKG_METAL && precision == Float64) ? continue : nothing # Metal does not support Float64 + +eval(:( + @testset "$(basename(@__FILE__)) (package: $(nameof($package))) (precision: $(nameof($precision)))" begin @require !@is_initialized() - # @static if $package == $PKG_METAL - # @init_parallel_stencil($package, Float32, 3) - # else - # @init_parallel_stencil($package, Float64, 3) - # end - @init_parallel_stencil($package, Float32, 3) + @init_parallel_stencil($package, $precision, 3) @require @is_initialized() nx, ny, nz = 7, 5, 6 A = @rand(nx , ny , nz ); @@ -178,4 +178,6 @@ Base.retry_load_extensions() # Potentially needed to load the extensions after t end; @reset_parallel_stencil() end; -)) end == nothing || true; +)) + +end end == nothing || true; diff --git a/test/test_parallel.jl b/test/test_parallel.jl index 63e0372f..dd434d26 100644 --- a/test/test_parallel.jl +++ b/test/test_parallel.jl @@ -26,16 +26,16 @@ Base.retry_load_extensions() # Potentially needed to load the extensions after t import ParallelStencil.@gorgeousexpand -@static for package in TEST_PACKAGES eval(:( - @testset "$(basename(@__FILE__)) (package: $(nameof($package)))" begin +const TEST_PRECISIONS = [Float32, Float64] +for package in TEST_PACKAGES +for precision in TEST_PRECISIONS +(package == PKG_METAL && precision == Float64) ? continue : nothing # Metal does not support Float64 + +eval(:( + @testset "$(basename(@__FILE__)) (package: $(nameof($package))) (precision: $(nameof($precision)))" begin @testset "1. parallel macros" begin @require !@is_initialized() - # @static if $package == $PKG_METAL - # @init_parallel_stencil($package, Float32, 3) - # else - # @init_parallel_stencil($package, Float64, 3) - # end - @init_parallel_stencil($package, Float32, 3) + @init_parallel_stencil($package, $precision, 3) @require @is_initialized() @testset "@parallel " begin # NOTE: calls must go to ParallelStencil.ParallelKernel.parallel and must therefore give the same result as in ParallelKernel, except for memopt tests (tests copied 1-to-1 from there). 
@static if $package == $PKG_CUDA @@ -182,7 +182,7 @@ import ParallelStencil.@gorgeousexpand end @testset "@parallel (3D; on-the-fly)" begin nx, ny, nz = 32, 8, 8 - lam=dt=_dx=_dy=_dz = 1.0f0 + lam=dt=_dx=_dy=_dz = 1 T = @zeros(nx, ny, nz); T2 = @zeros(nx, ny, nz); T2_ref = @zeros(nx, ny, nz); @@ -204,7 +204,7 @@ import ParallelStencil.@gorgeousexpand ); @test all(Array(T2) .≈ Array(T2_ref)) end - @static if $package in [$PKG_CUDA, $PKG_AMDGPU] + @static if $package in [$PKG_CUDA, $PKG_AMDGPU] # TODO add support for Metal @testset "@parallel memopt (nx, ny, nz = x .* threads)" begin # NOTE: the following does not work for some reason: (nx, ny, nz = ($nx, $ny, $nz))" for (nx, ny, nz) in ((32, 8, 9), (32, 8, 8), (31, 7, 9), (33, 9, 9), (33, 7, 8)) nx, ny, nz = 32, 8, 8 # threads = (8, 4, 1) @@ -239,12 +239,12 @@ import ParallelStencil.@gorgeousexpand copy!(A, [ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)].^3); @parallel_indices (ix,iy,iz) memopt=true loopsize=3 function d2_memopt!(A2, A) if (iz>1 && iz (3D, memopt, stencilranges=(0:0, -1:1, 0:0); y-stencil)" begin @@ -254,12 +254,12 @@ import ParallelStencil.@gorgeousexpand copy!(A, [ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)].^3); @parallel_indices (ix,iy,iz) memopt=true function d2_memopt!(A2, A) if (iy>1 && iy (3D, memopt, stencilranges=(1:1, 1:1, 0:2); z-stencil)" begin @@ -272,7 +272,7 @@ import ParallelStencil.@gorgeousexpand return end @parallel memopt=true d2_memopt!(A2, A); - A2_ref[2:end-1,2:end-1,2:end-1] .= A[2:end-1,2:end-1,3:end] .- 2.0.*A[2:end-1,2:end-1,2:end-1] .+ A[2:end-1,2:end-1,1:end-2]; + A2_ref[2:end-1,2:end-1,2:end-1] .= A[2:end-1,2:end-1,3:end] .- 2*A[2:end-1,2:end-1,2:end-1] .+ A[2:end-1,2:end-1,1:end-2]; @test all(Array(A2) .≈ Array(A2_ref)) end @testset "@parallel (3D, memopt, stencilranges=(1:1, 0:2, 1:1); y-stencil)" begin @@ -285,11 +285,11 @@ import ParallelStencil.@gorgeousexpand return end @parallel memopt=true d2_memopt!(A2, A); - A2_ref[2:end-1,2:end-1,2:end-1] .= A[2:end-1,3:end,2:end-1] .- 2.0.*A[2:end-1,2:end-1,2:end-1] .+ A[2:end-1,1:end-2,2:end-1]; + A2_ref[2:end-1,2:end-1,2:end-1] .= A[2:end-1,3:end,2:end-1] .- 2*A[2:end-1,2:end-1,2:end-1] .+ A[2:end-1,1:end-2,2:end-1]; @test all(Array(A2) .≈ Array(A2_ref)) end @testset "@parallel_indices (3D, memopt, stencilranges=-1:1)" begin - lam=dt=_dx=_dy=_dz = 1.0 + lam=dt=_dx=_dy=_dz = 1 T = @zeros(nx, ny, nz); T2 = @zeros(nx, ny, nz); T2_ref = @zeros(nx, ny, nz); @@ -314,7 +314,7 @@ import ParallelStencil.@gorgeousexpand @test all(Array(T2) .≈ Array(T2_ref)) end @testset "@parallel (3D, memopt, stencilranges=0:2)" begin - lam=dt=_dx=_dy=_dz = 1.0 + lam=dt=_dx=_dy=_dz = 1 T = @zeros(nx, ny, nz); T2 = @zeros(nx, ny, nz); T2_ref = @zeros(nx, ny, nz); @@ -339,22 +339,22 @@ import ParallelStencil.@gorgeousexpand copy!(A, [ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)].^3); @parallel_indices (ix,iy,iz) memopt=true loopsize=3 function higher_order_memopt!(A2, A) if (ix-4>1 && ix-11 && iy+2<=size(A2,2) && iz-2>=1 && iz+3<=size(A2,3)) - A2[ix-1,iy+2,iz] = A[ix-1,iy+2,iz+3] - 2.0*A[ix-3,iy+2,iz] + A[ix-4,iy+2,iz-2] + A2[ix-1,iy+2,iz] = A[ix-1,iy+2,iz+3] - 2*A[ix-3,iy+2,iz] + A[ix-4,iy+2,iz-2] end return end @parallel memopt=true higher_order_memopt!(A2, A); - A2_ref[5:end-1,3:end,3:end-3] .= A[5:end-1,3:end,6:end] .- 2.0.*A[3:end-3,3:end,3:end-3] .+ A[2:end-4,3:end,1:end-5]; + 
A2_ref[5:end-1,3:end,3:end-3] .= A[5:end-1,3:end,6:end] .- 2*A[3:end-3,3:end,3:end-3] .+ A[2:end-4,3:end,1:end-5]; @test all(Array(A2) .≈ Array(A2_ref)) end @testset "@parallel (3D, memopt, stencilranges=0:2; on-the-fly)" begin - lam=dt=_dx=_dy=_dz = 1.0 + lam=dt=_dx=_dy=_dz = 1 T = @zeros(nx, ny, nz); T2 = @zeros(nx, ny, nz); T2_ref = @zeros(nx, ny, nz); Ci = @ones(nx, ny, nz); copy!(T, [ix + (iy-1)*size(T,1) + (iz-1)*size(T,1)*size(T,2) for ix=1:size(T,1), iy=1:size(T,2), iz=1:size(T,3)].^3); - @parallel memopt=true loopsize=3 function diffusion3D_step!(T2, T, Ci, lam::Data.Number, dt::Float64, _dx, _dy, _dz) + @parallel memopt=true loopsize=3 function diffusion3D_step!(T2, T, Ci, lam, dt, _dx, _dy, _dz) @all(qx) = -lam*@d_xi(T)*_dx # Fourier's law of heat conduction @all(qy) = -lam*@d_yi(T)*_dy # ... @all(qz) = -lam*@d_zi(T)*_dz # ... @@ -392,12 +392,12 @@ import ParallelStencil.@gorgeousexpand copy!(B, 2 .* [ix + (iy-1)*size(B,1) + (iz-1)*size(B,1)*size(B,2) for ix=1:size(B,1), iy=1:size(B,2), iz=1:size(B,3)].^3); @parallel_indices (ix,iy,iz) memopt=true loopsize=3 function d2_memopt!(A2, A, B) if (iz>1 && iz (3D, memopt; 2 arrays, y-stencil)" begin @@ -409,12 +409,12 @@ import ParallelStencil.@gorgeousexpand copy!(B, 2 .* [ix + (iy-1)*size(B,1) + (iz-1)*size(B,1)*size(B,2) for ix=1:size(B,1), iy=1:size(B,2), iz=1:size(B,3)].^3); @parallel_indices (ix,iy,iz) memopt=true loopsize=3 function d2_memopt!(A2, A, B) if (iy>1 && iy (3D, memopt; 2 arrays, x-stencil)" begin @@ -426,16 +426,16 @@ import ParallelStencil.@gorgeousexpand copy!(B, 2 .* [ix + (iy-1)*size(B,1) + (iz-1)*size(B,1)*size(B,2) for ix=1:size(B,1), iy=1:size(B,2), iz=1:size(B,3)].^3); @parallel_indices (ix,iy,iz) memopt=true function d2_memopt!(A2, A, B) if (ix>1 && ix (3D, memopt; 2 arrays, x-y-z- + z-stencil)" begin - lam=dt=_dx=_dy=_dz = 1.0 + lam=dt=_dx=_dy=_dz = 1 T = @zeros(nx, ny, nz); T2 = @zeros(nx, ny, nz); T2_ref = @zeros(nx, ny, nz); @@ -455,7 +455,7 @@ import ParallelStencil.@gorgeousexpand @test all(Array(T2) .≈ Array(T2_ref)) end @testset "@parallel (3D, memopt; 2 arrays, x-y-z- + x-stencil)" begin - lam=dt=_dx=_dy=_dz = 1.0 + lam=dt=_dx=_dy=_dz = 1 T = @zeros(nx, ny, nz); T2 = @zeros(nx, ny, nz); T2_ref = @zeros(nx, ny, nz); @@ -475,7 +475,7 @@ import ParallelStencil.@gorgeousexpand @test all(Array(T2) .≈ Array(T2_ref)) end @testset "@parallel (3D, memopt; 3 arrays, x-y-z- + y- + x-stencil)" begin - lam=dt=_dx=_dy=_dz = 1.0 + lam=dt=_dx=_dy=_dz = 1 T = @zeros(nx, ny, nz); T2 = @zeros(nx, ny, nz); T2_ref = @zeros(nx, ny, nz); @@ -511,20 +511,20 @@ import ParallelStencil.@gorgeousexpand copy!(C, 3 .* [ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)].^3); @parallel_indices (ix,iy,iz) memopt=true loopsize=3 function higher_order_memopt!(A2, B2, C2, A, B, C) if (ix-4>1 && ix-11 && iy+2<=size(A2,2) && iz-2>=1 && iz+3<=size(A2,3)) - A2[ix-1,iy+2,iz] = A[ix-1,iy+2,iz+3] - 2.0*A[ix-3,iy+2,iz] + A[ix-4,iy+2,iz-2] + A2[ix-1,iy+2,iz] = A[ix-1,iy+2,iz+3] - 2*A[ix-3,iy+2,iz] + A[ix-4,iy+2,iz-2] end if (ix-4>1 && ix-11 && iy+2<=size(B2,2) && iz-2>=1 && iz+3<=size(B2,3)) - B2[ix-1,iy+2,iz] = B[ix-1,iy+2,iz+3] - 2.0*B[ix-3,iy+2,iz] + B[ix-4,iy+2,iz-2] + B2[ix-1,iy+2,iz] = B[ix-1,iy+2,iz+3] - 2*B[ix-3,iy+2,iz] + B[ix-4,iy+2,iz-2] end if (ix-4>1 && ix-11 && iy+2<=size(C2,2) && iz-2>=1 && iz+3<=size(C2,3)) - C2[ix-1,iy+2,iz] = C[ix-1,iy+2,iz+3] - 2.0*C[ix-3,iy+2,iz] + C[ix-4,iy+2,iz-2] + C2[ix-1,iy+2,iz] = C[ix-1,iy+2,iz+3] - 2*C[ix-3,iy+2,iz] + C[ix-4,iy+2,iz-2] end return 
end @parallel memopt=true higher_order_memopt!(A2, B2, C2, A, B, C); - A2_ref[5:end-1,3:end,3:end-3] .= A[5:end-1,3:end,6:end] .- 2.0.*A[3:end-3,3:end,3:end-3] .+ A[2:end-4,3:end,1:end-5]; - B2_ref[5:end-1,3:end,3:end-3] .= B[5:end-1,3:end,6:end] .- 2.0.*B[3:end-3,3:end,3:end-3] .+ B[2:end-4,3:end,1:end-5]; - C2_ref[5:end-1,3:end,3:end-3] .= C[5:end-1,3:end,6:end] .- 2.0.*C[3:end-3,3:end,3:end-3] .+ C[2:end-4,3:end,1:end-5]; + A2_ref[5:end-1,3:end,3:end-3] .= A[5:end-1,3:end,6:end] .- 2*A[3:end-3,3:end,3:end-3] .+ A[2:end-4,3:end,1:end-5]; + B2_ref[5:end-1,3:end,3:end-3] .= B[5:end-1,3:end,6:end] .- 2*B[3:end-3,3:end,3:end-3] .+ B[2:end-4,3:end,1:end-5]; + C2_ref[5:end-1,3:end,3:end-3] .= C[5:end-1,3:end,6:end] .- 2*C[3:end-3,3:end,3:end-3] .+ C[2:end-4,3:end,1:end-5]; @test all(Array(A2) .≈ Array(A2_ref)) @test all(Array(B2) .≈ Array(B2_ref)) @test all(Array(C2) .≈ Array(C2_ref)) @@ -544,20 +544,20 @@ import ParallelStencil.@gorgeousexpand copy!(C, 3 .* [ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)].^3); @parallel_indices (ix,iy,iz) memopt=true loopsize=3 function higher_order_memopt!(A2, B2, C2, A, B, C) if (ix-4>1 && ix-11 && iy+2<=size(A2,2) && iz-2>=1 && iz+3<=size(A2,3)) - A2[ix-1,iy+2,iz] = A[ix-1,iy+2,iz+3] - 2.0*A[ix-3,iy+2,iz] + A[ix-4,iy+2,iz-2] + A2[ix-1,iy+2,iz] = A[ix-1,iy+2,iz+3] - 2*A[ix-3,iy+2,iz] + A[ix-4,iy+2,iz-2] end if (ix-4>1 && ix-11 && iy+2<=size(B2,2) && iz+1>=1 && iz+2<=size(B2,3)) - B2[ix-1,iy+2,iz+1] = B[ix-1,iy+2,iz+2] - 2.0*B[ix-3,iy+2,iz+1] + B[ix-4,iy+2,iz+1] + B2[ix-1,iy+2,iz+1] = B[ix-1,iy+2,iz+2] - 2*B[ix-3,iy+2,iz+1] + B[ix-4,iy+2,iz+1] end if (ix-4>1 && ix-11 && iy+2<=size(C2,2) && iz-1>=1 && iz<=size(C2,3)) - C2[ix-1,iy+2,iz-1] = C[ix-1,iy+2,iz] - 2.0*C[ix-3,iy+2,iz-1] + C[ix-4,iy+2,iz-1] + C2[ix-1,iy+2,iz-1] = C[ix-1,iy+2,iz] - 2*C[ix-3,iy+2,iz-1] + C[ix-4,iy+2,iz-1] end return end @parallel memopt=true higher_order_memopt!(A2, B2, C2, A, B, C); - A2_ref[5:end-1,3:end,3:end-3] .= A[5:end-1,3:end,6:end] .- 2.0.*A[3:end-3,3:end,3:end-3] .+ A[2:end-4,3:end,1:end-5]; - B2_ref[5:end-1,3:end,2:end-1] .= B[5:end-1,3:end,3:end] .- 2.0.*B[3:end-3,3:end,2:end-1] .+ B[2:end-4,3:end,2:end-1]; - C2_ref[5:end-1,3:end,1:end-1] .= C[5:end-1,3:end,2:end] .- 2.0.*C[3:end-3,3:end,1:end-1] .+ C[2:end-4,3:end,1:end-1]; + A2_ref[5:end-1,3:end,3:end-3] .= A[5:end-1,3:end,6:end] .- 2*A[3:end-3,3:end,3:end-3] .+ A[2:end-4,3:end,1:end-5]; + B2_ref[5:end-1,3:end,2:end-1] .= B[5:end-1,3:end,3:end] .- 2*B[3:end-3,3:end,2:end-1] .+ B[2:end-4,3:end,2:end-1]; + C2_ref[5:end-1,3:end,1:end-1] .= C[5:end-1,3:end,2:end] .- 2*C[3:end-3,3:end,1:end-1] .+ C[2:end-4,3:end,1:end-1]; @test all(Array(A2) .≈ Array(A2_ref)) @test all(Array(B2) .≈ Array(B2_ref)) @test all(Array(C2) .≈ Array(C2_ref)) @@ -577,20 +577,20 @@ import ParallelStencil.@gorgeousexpand copy!(C, 3 .* [ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)].^3); @parallel_indices (ix,iy,iz) memopt=true loopsize=3 function higher_order_memopt!(A2, B2, C2, A, B, C) if (ix-4>1 && ix-11 && iy+2<=size(A2,2) && iz-2>=1 && iz+3<=size(A2,3)) - A2[ix-1,iy+2,iz] = A[ix-1,iy+2,iz+3] - 2.0*A[ix-3,iy+2,iz] + A[ix-4,iy+2,iz-2] + A2[ix-1,iy+2,iz] = A[ix-1,iy+2,iz+3] - 2*A[ix-3,iy+2,iz] + A[ix-4,iy+2,iz-2] end if (ix-4>1 && ix+11 && iy+2<=size(B2,2) && iz+1>=1 && iz+2<=size(B2,3)) - B2[ix+1,iy+2,iz+1] = B[ix+1,iy+2,iz+2] - 2.0*B[ix-3,iy+2,iz+1] + B[ix-4,iy+2,iz+1] + B2[ix+1,iy+2,iz+1] = B[ix+1,iy+2,iz+2] - 2*B[ix-3,iy+2,iz+1] + 
B[ix-4,iy+2,iz+1] end if (ix-1>1 && ix-11 && iy+2<=size(C2,2) && iz-1>=1 && iz<=size(C2,3)) - C2[ix-1,iy+2,iz-1] = C[ix-1,iy+2,iz] - 2.0*C[ix-1,iy+2,iz-1] + C[ix-1,iy+2,iz-1] + C2[ix-1,iy+2,iz-1] = C[ix-1,iy+2,iz] - 2*C[ix-1,iy+2,iz-1] + C[ix-1,iy+2,iz-1] end return end @parallel memopt=true higher_order_memopt!(A2, B2, C2, A, B, C); - A2_ref[5:end-1,3:end,3:end-3] .= A[5:end-1,3:end,6:end] .- 2.0.*A[3:end-3,3:end,3:end-3] .+ A[2:end-4,3:end,1:end-5]; - B2_ref[7:end-1,3:end,2:end-1] .= B[7:end-1,3:end,3:end] .- 2.0.*B[3:end-5,3:end,2:end-1] .+ B[2:end-6,3:end,2:end-1]; - C2_ref[2:end-1,3:end,1:end-1] .= C[2:end-1,3:end,2:end] .- 2.0.*C[2:end-1,3:end,1:end-1] .+ C[2:end-1,3:end,1:end-1]; + A2_ref[5:end-1,3:end,3:end-3] .= A[5:end-1,3:end,6:end] .- 2*A[3:end-3,3:end,3:end-3] .+ A[2:end-4,3:end,1:end-5]; + B2_ref[7:end-1,3:end,2:end-1] .= B[7:end-1,3:end,3:end] .- 2*B[3:end-5,3:end,2:end-1] .+ B[2:end-6,3:end,2:end-1]; + C2_ref[2:end-1,3:end,1:end-1] .= C[2:end-1,3:end,2:end] .- 2*C[2:end-1,3:end,1:end-1] .+ C[2:end-1,3:end,1:end-1]; @test all(Array(A2) .≈ Array(A2_ref)) @test all(Array(B2) .≈ Array(B2_ref)) @test all(Array(C2) .≈ Array(C2_ref)) @@ -610,13 +610,13 @@ import ParallelStencil.@gorgeousexpand copy!(C, 3 .* [ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)].^3); kernel = @gorgeousstring @parallel_indices (ix,iy,iz) memopt=true optvars=(A, C) loopdim=3 loopsize=3 optranges=(A=(-4:-1, 2:2, -2:3), B=(-4:1, 2:2, 1:2), C=(-1:-1, 2:2, -1:0)) function higher_order_memopt!(A2, B2, C2, A, B, C) if (ix-4>1 && ix-11 && iy+2<=size(A2,2) && iz-2>=1 && iz+3<=size(A2,3)) - A2[ix-1,iy+2,iz] = A[ix-1,iy+2,iz+3] - 2.0*A[ix-3,iy+2,iz] + A[ix-4,iy+2,iz-2] + A2[ix-1,iy+2,iz] = A[ix-1,iy+2,iz+3] - 2*A[ix-3,iy+2,iz] + A[ix-4,iy+2,iz-2] end if (ix-4>1 && ix+11 && iy+2<=size(B2,2) && iz+1>=1 && iz+2<=size(B2,3)) - B2[ix+1,iy+2,iz+1] = B[ix+1,iy+2,iz+2] - 2.0*B[ix-3,iy+2,iz+1] + B[ix-4,iy+2,iz+1] + B2[ix+1,iy+2,iz+1] = B[ix+1,iy+2,iz+2] - 2*B[ix-3,iy+2,iz+1] + B[ix-4,iy+2,iz+1] end if (ix-1>1 && ix-11 && iy+2<=size(C2,2) && iz-1>=1 && iz<=size(C2,3)) - C2[ix-1,iy+2,iz-1] = C[ix-1,iy+2,iz] - 2.0*C[ix-1,iy+2,iz-1] + C[ix-1,iy+2,iz-1] + C2[ix-1,iy+2,iz-1] = C[ix-1,iy+2,iz] - 2*C[ix-1,iy+2,iz-1] + C[ix-1,iy+2,iz-1] end return end @@ -624,28 +624,30 @@ import ParallelStencil.@gorgeousexpand @test occursin("loopoffset = ((CUDA.blockIdx()).z - 1) * 3", kernel) elseif $package == $PKG_AMDGPU @test occursin("loopoffset = ((AMDGPU.workgroupIdx()).z - 1) * 3", kernel) + elseif $package == $PKG_METAL + @test occursin("loopoffset = ((Metal.threadgroup_position_in_grid_3d()).z - 1) * 3", kernel) end @test occursin("for i = -4:3", kernel) @test occursin("tz = i + loopoffset", kernel) - @test occursin("A2[ix - 1, iy + 2, iz] = (A_ixm1_iyp2_izp3 - 2.0A_ixm3_iyp2_iz) + A_ixm4_iyp2_izm2", kernel) - @test occursin("B2[ix + 1, iy + 2, iz + 1] = (B[ix + 1, iy + 2, iz + 2] - 2.0 * B[ix - 3, iy + 2, iz + 1]) + B[ix - 4, iy + 2, iz + 1]", kernel) - @test occursin("C2[ix - 1, iy + 2, iz - 1] = (C_ixm1_iyp2_iz - 2.0C_ixm1_iyp2_izm1) + C_ixm1_iyp2_izm1", kernel) + @test occursin("A2[ix - 1, iy + 2, iz] = (A_ixm1_iyp2_izp3 - 2A_ixm3_iyp2_iz) + A_ixm4_iyp2_izm2", kernel) + @test occursin("B2[ix + 1, iy + 2, iz + 1] = (B[ix + 1, iy + 2, iz + 2] - 2 * B[ix - 3, iy + 2, iz + 1]) + B[ix - 4, iy + 2, iz + 1]", kernel) + @test occursin("C2[ix - 1, iy + 2, iz - 1] = (C_ixm1_iyp2_iz - 2C_ixm1_iyp2_izm1) + C_ixm1_iyp2_izm1", kernel) @parallel_indices (ix,iy,iz) memopt=true optvars=(A, C) 
loopdim=3 loopsize=3 optranges=(A=(-4:-1, 2:2, -2:3), B=(-4:1, 2:2, 1:2), C=(-1:-1, 2:2, -1:0)) function higher_order_memopt!(A2, B2, C2, A, B, C) if (ix-4>1 && ix-11 && iy+2<=size(A2,2) && iz-2>=1 && iz+3<=size(A2,3)) - A2[ix-1,iy+2,iz] = A[ix-1,iy+2,iz+3] - 2.0*A[ix-3,iy+2,iz] + A[ix-4,iy+2,iz-2] + A2[ix-1,iy+2,iz] = A[ix-1,iy+2,iz+3] - 2*A[ix-3,iy+2,iz] + A[ix-4,iy+2,iz-2] end if (ix-4>1 && ix+11 && iy+2<=size(B2,2) && iz+1>=1 && iz+2<=size(B2,3)) - B2[ix+1,iy+2,iz+1] = B[ix+1,iy+2,iz+2] - 2.0*B[ix-3,iy+2,iz+1] + B[ix-4,iy+2,iz+1] + B2[ix+1,iy+2,iz+1] = B[ix+1,iy+2,iz+2] - 2*B[ix-3,iy+2,iz+1] + B[ix-4,iy+2,iz+1] end if (ix-1>1 && ix-11 && iy+2<=size(C2,2) && iz-1>=1 && iz<=size(C2,3)) - C2[ix-1,iy+2,iz-1] = C[ix-1,iy+2,iz] - 2.0*C[ix-1,iy+2,iz-1] + C[ix-1,iy+2,iz-1] + C2[ix-1,iy+2,iz-1] = C[ix-1,iy+2,iz] - 2*C[ix-1,iy+2,iz-1] + C[ix-1,iy+2,iz-1] end return end @parallel memopt=true higher_order_memopt!(A2, B2, C2, A, B, C); - A2_ref[5:end-1,3:end,3:end-3] .= A[5:end-1,3:end,6:end] .- 2.0.*A[3:end-3,3:end,3:end-3] .+ A[2:end-4,3:end,1:end-5]; - B2_ref[7:end-1,3:end,2:end-1] .= B[7:end-1,3:end,3:end] .- 2.0.*B[3:end-5,3:end,2:end-1] .+ B[2:end-6,3:end,2:end-1]; - C2_ref[2:end-1,3:end,1:end-1] .= C[2:end-1,3:end,2:end] .- 2.0.*C[2:end-1,3:end,1:end-1] .+ C[2:end-1,3:end,1:end-1]; + A2_ref[5:end-1,3:end,3:end-3] .= A[5:end-1,3:end,6:end] .- 2*A[3:end-3,3:end,3:end-3] .+ A[2:end-4,3:end,1:end-5]; + B2_ref[7:end-1,3:end,2:end-1] .= B[7:end-1,3:end,3:end] .- 2*B[3:end-5,3:end,2:end-1] .+ B[2:end-6,3:end,2:end-1]; + C2_ref[2:end-1,3:end,1:end-1] .= C[2:end-1,3:end,2:end] .- 2*C[2:end-1,3:end,1:end-1] .+ C[2:end-1,3:end,1:end-1]; @test all(Array(A2) .≈ Array(A2_ref)) @test all(Array(B2) .≈ Array(B2_ref)) @test all(Array(C2) .≈ Array(C2_ref)) @@ -665,13 +667,13 @@ import ParallelStencil.@gorgeousexpand copy!(C, 3 .* [ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)].^3); kernel = @gorgeousstring @parallel_indices (ix,iy,iz) memopt=true optvars=(A, C) loopdim=3 loopsize=3 optranges=(A=(-4:-1, 2:2, -2:3), B=(-4:1, 2:2, 1:2), C=(-1:-1, 2:2, -1:0)) function higher_order_memopt!(A2, B2, C2, A, B, C) if (ix-4>1 && ix-11 && iy+2<=size(A2,2) && iz-2>=1 && iz+3<=size(A2,3)) - A2[ix-1,iy+2,iz] = A[ix-1,iy+2,iz+3] - 2.0*A[ix-3,iy+2,iz] + A[ix-4,iy+2,iz-2] + A2[ix-1,iy+2,iz] = A[ix-1,iy+2,iz+3] - 2*A[ix-3,iy+2,iz] + A[ix-4,iy+2,iz-2] end if (ix-4>1 && ix+11 && iy+2<=size(B2,2) && iz+1>=1 && iz+2<=size(B2,3)) - B2[ix+1,iy+2,iz+1] = B[ix+1,iy+2,iz+2] - 2.0*B[ix-3,iy+2,iz+1] + B[ix-4,iy+2,iz+1] + B2[ix+1,iy+2,iz+1] = B[ix+1,iy+2,iz+2] - 2*B[ix-3,iy+2,iz+1] + B[ix-4,iy+2,iz+1] end if (ix-1>1 && ix-11 && iy+2<=size(C2,2) && iz-1>=1 && iz<=size(C2,3)) - C2[ix-1,iy+2,iz-1] = C[ix-1,iy+2,iz] - 2.0*C[ix-1,iy+2,iz-1] + C[ix-1,iy+2,iz-1] + C2[ix-1,iy+2,iz-1] = C[ix-1,iy+2,iz] - 2*C[ix-1,iy+2,iz-1] + C[ix-1,iy+2,iz-1] end return end @@ -679,28 +681,30 @@ import ParallelStencil.@gorgeousexpand @test occursin("loopoffset = ((CUDA.blockIdx()).z - 1) * 3", kernel) elseif $package == $PKG_AMDGPU @test occursin("loopoffset = ((AMDGPU.workgroupIdx()).z - 1) * 3", kernel) + elseif $package == $PKG_METAL + @test occursin("loopoffset = ((Metal.threadgroup_position_in_grid_3d()).z - 1) * 3", kernel) end @test occursin("for i = -4:3", kernel) @test occursin("tz = i + loopoffset", kernel) - @test occursin("A2[ix - 1, iy + 2, iz] = (A_ixm1_iyp2_izp3 - 2.0A_ixm3_iyp2_iz) + A_ixm4_iyp2_izm2", kernel) - @test occursin("B2[ix + 1, iy + 2, iz + 1] = (B[ix + 1, iy + 2, iz + 2] - 2.0 * 
B[ix - 3, iy + 2, iz + 1]) + B[ix - 4, iy + 2, iz + 1]", kernel) - @test occursin("C2[ix - 1, iy + 2, iz - 1] = (C_ixm1_iyp2_iz - 2.0C_ixm1_iyp2_izm1) + C_ixm1_iyp2_izm1", kernel) + @test occursin("A2[ix - 1, iy + 2, iz] = (A_ixm1_iyp2_izp3 - 2A_ixm3_iyp2_iz) + A_ixm4_iyp2_izm2", kernel) + @test occursin("B2[ix + 1, iy + 2, iz + 1] = (B[ix + 1, iy + 2, iz + 2] - 2 * B[ix - 3, iy + 2, iz + 1]) + B[ix - 4, iy + 2, iz + 1]", kernel) + @test occursin("C2[ix - 1, iy + 2, iz - 1] = (C_ixm1_iyp2_iz - 2C_ixm1_iyp2_izm1) + C_ixm1_iyp2_izm1", kernel) @parallel_indices (ix,iy,iz) memopt=true optvars=(A, C) loopdim=3 loopsize=3 optranges=(A=(-4:-1, 2:2, -2:3), B=(-4:1, 2:2, 1:2), C=(-1:-1, 2:2, -1:0)) function higher_order_memopt!(A2, B2, C2, A, B, C) if (ix-4>1 && ix-11 && iy+2<=size(A2,2) && iz-2>=1 && iz+3<=size(A2,3)) - A2[ix-1,iy+2,iz] = A[ix-1,iy+2,iz+3] - 2.0*A[ix-3,iy+2,iz] + A[ix-4,iy+2,iz-2] + A2[ix-1,iy+2,iz] = A[ix-1,iy+2,iz+3] - 2*A[ix-3,iy+2,iz] + A[ix-4,iy+2,iz-2] end if (ix-4>1 && ix+11 && iy+2<=size(B2,2) && iz+1>=1 && iz+2<=size(B2,3)) - B2[ix+1,iy+2,iz+1] = B[ix+1,iy+2,iz+2] - 2.0*B[ix-3,iy+2,iz+1] + B[ix-4,iy+2,iz+1] + B2[ix+1,iy+2,iz+1] = B[ix+1,iy+2,iz+2] - 2*B[ix-3,iy+2,iz+1] + B[ix-4,iy+2,iz+1] end if (ix-1>1 && ix-11 && iy+2<=size(C2,2) && iz-1>=1 && iz<=size(C2,3)) - C2[ix-1,iy+2,iz-1] = C[ix-1,iy+2,iz] - 2.0*C[ix-1,iy+2,iz-1] + C[ix-1,iy+2,iz-1] + C2[ix-1,iy+2,iz-1] = C[ix-1,iy+2,iz] - 2*C[ix-1,iy+2,iz-1] + C[ix-1,iy+2,iz-1] end return end @parallel memopt=true higher_order_memopt!(A2, B2, C2, A, B, C); - A2_ref[5:end-1,3:end,3:end-3] .= A[5:end-1,3:end,6:end] .- 2.0.*A[3:end-3,3:end,3:end-3] .+ A[2:end-4,3:end,1:end-5]; - B2_ref[7:end-1,3:end,2:end-1] .= B[7:end-1,3:end,3:end] .- 2.0.*B[3:end-5,3:end,2:end-1] .+ B[2:end-6,3:end,2:end-1]; - C2_ref[2:end-1,3:end,1:end-1] .= C[2:end-1,3:end,2:end] .- 2.0.*C[2:end-1,3:end,1:end-1] .+ C[2:end-1,3:end,1:end-1]; + A2_ref[5:end-1,3:end,3:end-3] .= A[5:end-1,3:end,6:end] .- 2*A[3:end-3,3:end,3:end-3] .+ A[2:end-4,3:end,1:end-5]; + B2_ref[7:end-1,3:end,2:end-1] .= B[7:end-1,3:end,3:end] .- 2*B[3:end-5,3:end,2:end-1] .+ B[2:end-6,3:end,2:end-1]; + C2_ref[2:end-1,3:end,1:end-1] .= C[2:end-1,3:end,2:end] .- 2*C[2:end-1,3:end,1:end-1] .+ C[2:end-1,3:end,1:end-1]; @test all(Array(A2) .≈ Array(A2_ref)) @test all(Array(B2) .≈ Array(B2_ref)) @test all(Array(C2) .≈ Array(C2_ref)) @@ -720,35 +724,35 @@ import ParallelStencil.@gorgeousexpand copy!(C, 3 .* [ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)].^3); kernel = @gorgeousstring @parallel_indices (ix,iy,iz) memopt=true optvars=(A, B) loopdim=3 loopsize=3 optranges=(A=(-1:-1, 2:2, -2:3), B=(-4:-3, 2:2, 1:1)) function higher_order_memopt!(A2, B2, C2, A, B, C) if (ix-4>1 && ix-11 && iy+2<=size(A2,2) && iz-2>=1 && iz+3<=size(A2,3)) - A2[ix-1,iy+2,iz] = A[ix-1,iy+2,iz+3] - 2.0*A[ix-3,iy+2,iz] + A[ix-4,iy+2,iz-2] + A2[ix-1,iy+2,iz] = A[ix-1,iy+2,iz+3] - 2*A[ix-3,iy+2,iz] + A[ix-4,iy+2,iz-2] end if (ix-4>1 && ix+11 && iy+2<=size(B2,2) && iz+1>=1 && iz+2<=size(B2,3)) - B2[ix+1,iy+2,iz+1] = B[ix+1,iy+2,iz+2] - 2.0*B[ix-3,iy+2,iz+1] + B[ix-4,iy+2,iz+1] + B2[ix+1,iy+2,iz+1] = B[ix+1,iy+2,iz+2] - 2*B[ix-3,iy+2,iz+1] + B[ix-4,iy+2,iz+1] end if (ix-1>1 && ix-11 && iy+2<=size(C2,2) && iz-1>=1 && iz<=size(C2,3)) - C2[ix-1,iy+2,iz-1] = C[ix-1,iy+2,iz] - 2.0*C[ix-1,iy+2,iz-1] + C[ix-1,iy+2,iz-1] + C2[ix-1,iy+2,iz-1] = C[ix-1,iy+2,iz] - 2*C[ix-1,iy+2,iz-1] + C[ix-1,iy+2,iz-1] end return end - @test occursin("A2[ix - 1, iy + 2, iz] = (A_ixm1_iyp2_izp3 - 
2.0 * A[ix - 3, iy + 2, iz]) + A[ix - 4, iy + 2, iz - 2]", kernel) - @test occursin("B2[ix + 1, iy + 2, iz + 1] = (B[ix + 1, iy + 2, iz + 2] - 2.0B_ixm3_iyp2_izp1) + B_ixm4_iyp2_izp1", kernel) # NOTE: when z is restricted to 1:1 then x cannot include +1, as else the x-y range does not include any z (result: IncoherentArgumentError: incoherent argument in memopt: optranges in z dimension do not include any array access.). - @test occursin("C2[ix - 1, iy + 2, iz - 1] = (C[ix - 1, iy + 2, iz] - 2.0 * C[ix - 1, iy + 2, iz - 1]) + C[ix - 1, iy + 2, iz - 1]", kernel) + @test occursin("A2[ix - 1, iy + 2, iz] = (A_ixm1_iyp2_izp3 - 2 * A[ix - 3, iy + 2, iz]) + A[ix - 4, iy + 2, iz - 2]", kernel) + @test occursin("B2[ix + 1, iy + 2, iz + 1] = (B[ix + 1, iy + 2, iz + 2] - 2B_ixm3_iyp2_izp1) + B_ixm4_iyp2_izp1", kernel) # NOTE: when z is restricted to 1:1 then x cannot include +1, as else the x-y range does not include any z (result: IncoherentArgumentError: incoherent argument in memopt: optranges in z dimension do not include any array access.). + @test occursin("C2[ix - 1, iy + 2, iz - 1] = (C[ix - 1, iy + 2, iz] - 2 * C[ix - 1, iy + 2, iz - 1]) + C[ix - 1, iy + 2, iz - 1]", kernel) @parallel_indices (ix,iy,iz) memopt=true optvars=(A, B) loopdim=3 loopsize=3 optranges=(A=(-1:-1, 2:2, -2:3), B=(-4:-3, 2:2, 1:1)) function higher_order_memopt!(A2, B2, C2, A, B, C) if (ix-4>1 && ix-11 && iy+2<=size(A2,2) && iz-2>=1 && iz+3<=size(A2,3)) - A2[ix-1,iy+2,iz] = A[ix-1,iy+2,iz+3] - 2.0*A[ix-3,iy+2,iz] + A[ix-4,iy+2,iz-2] + A2[ix-1,iy+2,iz] = A[ix-1,iy+2,iz+3] - 2*A[ix-3,iy+2,iz] + A[ix-4,iy+2,iz-2] end if (ix-4>1 && ix+11 && iy+2<=size(B2,2) && iz+1>=1 && iz+2<=size(B2,3)) - B2[ix+1,iy+2,iz+1] = B[ix+1,iy+2,iz+2] - 2.0*B[ix-3,iy+2,iz+1] + B[ix-4,iy+2,iz+1] + B2[ix+1,iy+2,iz+1] = B[ix+1,iy+2,iz+2] - 2*B[ix-3,iy+2,iz+1] + B[ix-4,iy+2,iz+1] end if (ix-1>1 && ix-11 && iy+2<=size(C2,2) && iz-1>=1 && iz<=size(C2,3)) - C2[ix-1,iy+2,iz-1] = C[ix-1,iy+2,iz] - 2.0*C[ix-1,iy+2,iz-1] + C[ix-1,iy+2,iz-1] + C2[ix-1,iy+2,iz-1] = C[ix-1,iy+2,iz] - 2*C[ix-1,iy+2,iz-1] + C[ix-1,iy+2,iz-1] end return end @parallel memopt=true higher_order_memopt!(A2, B2, C2, A, B, C); - A2_ref[5:end-1,3:end,3:end-3] .= A[5:end-1,3:end,6:end] .- 2.0.*A[3:end-3,3:end,3:end-3] .+ A[2:end-4,3:end,1:end-5]; - B2_ref[7:end-1,3:end,2:end-1] .= B[7:end-1,3:end,3:end] .- 2.0.*B[3:end-5,3:end,2:end-1] .+ B[2:end-6,3:end,2:end-1]; - C2_ref[2:end-1,3:end,1:end-1] .= C[2:end-1,3:end,2:end] .- 2.0.*C[2:end-1,3:end,1:end-1] .+ C[2:end-1,3:end,1:end-1]; + A2_ref[5:end-1,3:end,3:end-3] .= A[5:end-1,3:end,6:end] .- 2*A[3:end-3,3:end,3:end-3] .+ A[2:end-4,3:end,1:end-5]; + B2_ref[7:end-1,3:end,2:end-1] .= B[7:end-1,3:end,3:end] .- 2*B[3:end-5,3:end,2:end-1] .+ B[2:end-6,3:end,2:end-1]; + C2_ref[2:end-1,3:end,1:end-1] .= C[2:end-1,3:end,2:end] .- 2*C[2:end-1,3:end,1:end-1] .+ C[2:end-1,3:end,1:end-1]; @test all(Array(A2) .≈ Array(A2_ref)) @test all(Array(B2) .≈ Array(B2_ref)) @test all(Array(C2) .≈ Array(C2_ref)) @@ -782,7 +786,7 @@ import ParallelStencil.@gorgeousexpand @test all(Array(A2) .≈ Array(A)) end @testset "@parallel (3D, memopt, stencilranges=0:2)" begin - lam=dt=_dx=_dy=_dz = 1.0 + lam=dt=_dx=_dy=_dz = 1 T = @zeros(nx, ny, nz); T2 = @zeros(nx, ny, nz); T2_ref = @zeros(nx, ny, nz); @@ -801,7 +805,7 @@ import ParallelStencil.@gorgeousexpand @test all(Array(T2) .≈ Array(T2_ref)) end @testset "@parallel (3D, memopt; 3 arrays, x-y-z- + y- + x-stencil)" begin - lam=dt=_dx=_dy=_dz = 1.0 + lam=dt=_dx=_dy=_dz = 1 T = @zeros(nx, ny, nz); T2 = @zeros(nx, ny, 
nz); T2_ref = @zeros(nx, ny, nz); @@ -832,19 +836,14 @@ import ParallelStencil.@gorgeousexpand end; @reset_parallel_stencil() end; - @testset "2. parallel macros (2D)" begin + @testset "2 parallel macros (2D)" begin @require !@is_initialized() - # @static if $package == $PKG_METAL - # @init_parallel_stencil($package, Float32, 2) - # else - # @init_parallel_stencil($package, Float64, 2) - # end - @init_parallel_stencil($package, Float32, 2) + @init_parallel_stencil($package, $precision, 2) @require @is_initialized() - @static if $package in [$PKG_CUDA, $PKG_AMDGPU] + @static if $package in [$PKG_CUDA, $PKG_AMDGPU] # TODO add support for Metal nx, ny, nz = 32, 8, 1 @testset "@parallel_indices (2D, memopt, stencilranges=(-1:1,-1:1,0:0))" begin - lam=dt=_dx=_dy = 1.0 + lam=dt=_dx=_dy = 1 T = @zeros(nx, ny, nz); T2 = @zeros(nx, ny, nz); T2_ref = @zeros(nx, ny, nz); @@ -872,12 +871,7 @@ import ParallelStencil.@gorgeousexpand @testset "3. global defaults" begin @testset "inbounds=true" begin @require !@is_initialized() - # @static if $package == $PKG_METAL - # @init_parallel_stencil($package, Float32, 1, inbounds=true) - # else - # @init_parallel_stencil($package, Float64, 1, inbounds=true) - # end - @init_parallel_stencil($package, Float32, 1, inbounds=true) + @init_parallel_stencil($package, $precision, 1, inbounds=true) @require @is_initialized expansion = @prettystring(1, @parallel_indices (ix) inbounds=true f(A) = (2*A; return)) @test occursin("Base.@inbounds begin", expansion) @@ -889,50 +883,43 @@ import ParallelStencil.@gorgeousexpand end; @testset "@parallel_indices (I...) (1D)" begin @require !@is_initialized() - @init_parallel_stencil($package, Float32, 1) + @init_parallel_stencil($package, $precision, 1) @require @is_initialized A = @zeros(4*5*6) - @parallel_indices (I...) function write_indices!(A) - A[I...] = sum((I .- (1,)) .* (1.0)); + one = 1 + @parallel_indices (I...) function write_indices!(A, one) + A[I...] = sum((I .- (1,)) .* (one)); return end - @parallel write_indices!(A); + @parallel write_indices!(A, one); @test all(Array(A) .≈ [(ix-1) for ix=1:size(A,1)]) @reset_parallel_stencil() end; @testset "@parallel_indices (I...) (2D)" begin @require !@is_initialized() - # @static if $package == $PKG_METAL - # @init_parallel_stencil($package, Float32, 2) - # else - # @init_parallel_stencil($package, Float64, 2) - # end - @init_parallel_stencil($package, Float32, 2) + @init_parallel_stencil($package, $precision, 2) @require @is_initialized A = @zeros(4, 5*6) - @parallel_indices (I...) function write_indices!(A) - A[I...] = sum((I .- (1,)) .* (1.0, size(A,1))); + one = 1 + @parallel_indices (I...) function write_indices!(A, one) + A[I...] = sum((I .- (1,)) .* (one, size(A,1))); return end - @parallel write_indices!(A); + @parallel write_indices!(A, one); @test all(Array(A) .≈ [(ix-1) + (iy-1)*size(A,1) for ix=1:size(A,1), iy=1:size(A,2)]) @reset_parallel_stencil() end; @testset "@parallel_indices (I...) (3D)" begin @require !@is_initialized() - # @static if $package == $PKG_METAL - # @init_parallel_stencil($package, Float32, 3) - # else - # @init_parallel_stencil($package, Float64, 3) - # end - @init_parallel_stencil($package, Float32, 3) + @init_parallel_stencil($package, $precision, 3) @require @is_initialized A = @zeros(4, 5, 6) - @parallel_indices (I...) function write_indices!(A) - A[I...] = sum((I .- (1,)) .* (1.0, size(A,1), size(A,1)*size(A,2))); + one = 1 + @parallel_indices (I...) function write_indices!(A, one) + A[I...] 
= sum((I .- (1,)) .* (one, size(A,1), size(A,1)*size(A,2))); return end - @parallel write_indices!(A); + @parallel write_indices!(A, one); @test all(Array(A) .≈ [(ix-1) + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]) @reset_parallel_stencil() end; @@ -1032,12 +1019,7 @@ import ParallelStencil.@gorgeousexpand @reset_parallel_stencil() end; @testset "5. Exceptions" begin - # @static if $package == $PKG_METAL - # @init_parallel_stencil($package, Float32, 3) - # else - # @init_parallel_stencil($package, Float64, 3) - # end - @init_parallel_stencil($package, Float32, 3) + @init_parallel_stencil($package, $precision, 3) @require @is_initialized @testset "arguments @parallel" begin @test_throws ArgumentError checkargs_parallel(); # Error: isempty(args) @@ -1054,4 +1036,6 @@ import ParallelStencil.@gorgeousexpand @reset_parallel_stencil() end; end; -)) end == nothing || true; +)) + +end end == nothing || true; From b2a419661dfc9095eb7396c6117f8d8b7d6fc613 Mon Sep 17 00:00:00 2001 From: GiackAloZ Date: Wed, 9 Oct 2024 15:57:35 +0200 Subject: [PATCH 20/46] Add some documentation --- README.md | 5 +++-- src/ParallelKernel/Data.jl | 12 ++++++------ src/ParallelKernel/allocators.jl | 4 ++-- src/ParallelKernel/parallel.jl | 6 +++--- src/ParallelStencil.jl | 2 +- src/init_parallel_stencil.jl | 2 +- src/parallel.jl | 6 +++--- 7 files changed, 19 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 99979880..5d016440 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ ParallelStencil empowers domain scientists to write architecture-agnostic high-l ![Performance ParallelStencil Teff](docs/images/perf_ps2.png) -ParallelStencil relies on the native kernel programming capabilities of [CUDA.jl] and [AMDGPU.jl] and on [Base.Threads] for high-performance computations on GPUs and CPUs, respectively. It is seamlessly interoperable with [ImplicitGlobalGrid.jl], which renders the distributed parallelization of stencil-based GPU and CPU applications on a regular staggered grid almost trivial and enables close to ideal weak scaling of real-world applications on thousands of GPUs \[[1][JuliaCon20a], [2][JuliaCon20b], [3][JuliaCon19], [4][PASC19]\]. Moreover, ParallelStencil enables hiding communication behind computation with a simple macro call and without any particular restrictions on the package used for communication. ParallelStencil has been designed in conjunction with [ImplicitGlobalGrid.jl] for simplest possible usage by domain-scientists, rendering fast and interactive development of massively scalable high performance multi-GPU applications readily accessible to them. Furthermore, we have developed a self-contained approach for "Solving Nonlinear Multi-Physics on GPU Supercomputers with Julia" relying on ParallelStencil and [ImplicitGlobalGrid.jl] \[[1][JuliaCon20a]\]. ParallelStencil's feature to hide communication behind computation was showcased when a close to ideal weak scaling was demonstrated for a 3-D poro-hydro-mechanical real-world application on up to 1024 GPUs on the Piz Daint Supercomputer \[[1][JuliaCon20a]\]: +ParallelStencil relies on the native kernel programming capabilities of [CUDA.jl], [AMDGPU.jl], [Metal.jl] and on [Base.Threads] for high-performance computations on GPUs and CPUs, respectively. 
It is seamlessly interoperable with [ImplicitGlobalGrid.jl], which renders the distributed parallelization of stencil-based GPU and CPU applications on a regular staggered grid almost trivial and enables close to ideal weak scaling of real-world applications on thousands of GPUs \[[1][JuliaCon20a], [2][JuliaCon20b], [3][JuliaCon19], [4][PASC19]\]. Moreover, ParallelStencil enables hiding communication behind computation with a simple macro call and without any particular restrictions on the package used for communication. ParallelStencil has been designed in conjunction with [ImplicitGlobalGrid.jl] for simplest possible usage by domain-scientists, rendering fast and interactive development of massively scalable high performance multi-GPU applications readily accessible to them. Furthermore, we have developed a self-contained approach for "Solving Nonlinear Multi-Physics on GPU Supercomputers with Julia" relying on ParallelStencil and [ImplicitGlobalGrid.jl] \[[1][JuliaCon20a]\]. ParallelStencil's feature to hide communication behind computation was showcased when a close to ideal weak scaling was demonstrated for a 3-D poro-hydro-mechanical real-world application on up to 1024 GPUs on the Piz Daint Supercomputer \[[1][JuliaCon20a]\]: ![Parallel efficiency of ParallelStencil with CUDA C backend](docs/images/par_eff_c_julia2.png) @@ -32,7 +32,7 @@ Beyond traditional high-performance computing, ParallelStencil supports automati * [References](#references) ## Parallelization and optimization with one macro call -A simple call to `@parallel` is enough to parallelize and optimize a function and to launch it. The package used underneath for parallelization is defined in a call to `@init_parallel_stencil` beforehand. Supported are [CUDA.jl] and [AMDGPU.jl] for running on GPU and [Base.Threads] for CPU. The following example outlines how to run parallel computations on a GPU using the native kernel programming capabilities of [CUDA.jl] underneath (omitted lines are represented with `#(...)`, omitted arguments with `...`): +A simple call to `@parallel` is enough to parallelize and optimize a function and to launch it. The package used underneath for parallelization is defined in a call to `@init_parallel_stencil` beforehand. Supported are [CUDA.jl], [AMDGPU.jl] and [Metal.jl] for running on GPU and [Base.Threads] for CPU. The following example outlines how to run parallel computations on a GPU using the native kernel programming capabilities of [CUDA.jl] underneath (omitted lines are represented with `#(...)`, omitted arguments with `...`): ```julia #(...) @init_parallel_stencil(CUDA,...) @@ -553,6 +553,7 @@ Please open an issue to discuss your idea for a contribution beforehand. Further [CellArrays.jl]: https://github.com/omlins/CellArrays.jl [CUDA.jl]: https://github.com/JuliaGPU/CUDA.jl [AMDGPU.jl]: https://github.com/JuliaGPU/AMDGPU.jl +[Metal.jl]: https://github.com/JuliaGPU/Metal.jl [Enzyme.jl]: https://github.com/EnzymeAD/Enzyme.jl [MacroTools.jl]: https://github.com/FluxML/MacroTools.jl [StaticArrays.jl]: https://github.com/JuliaArrays/StaticArrays.jl diff --git a/src/ParallelKernel/Data.jl b/src/ParallelKernel/Data.jl index 16a72d50..5798ed9d 100644 --- a/src/ParallelKernel/Data.jl +++ b/src/ParallelKernel/Data.jl @@ -16,12 +16,12 @@ The type of indices used in parallel kernels. 
-------------------------------------------------------------------------------- Data.Array{ndims} -Expands to `Data.Array{numbertype, ndims}`, where `numbertype` is the datatype selected with [`@init_parallel_kernel`](@ref) and the datatype `Data.Array` is chosen to be compatible with the package for parallelization selected with [`@init_parallel_kernel`](@ref) (Array for Threads or Polyester, CUDA.CuArray or CUDA.CuDeviceArray for CUDA and AMDGPU.ROCArray or AMDGPU.ROCDeviceArray for AMDGPU; [`@parallel`](@ref) and [`@parallel_indices`](@ref) convert CUDA.CuArray and AMDGPU.ROCArray automatically to CUDA.CuDeviceArray and AMDGPU.ROCDeviceArray in kernels when required). +Expands to `Data.Array{numbertype, ndims}`, where `numbertype` is the datatype selected with [`@init_parallel_kernel`](@ref) and the datatype `Data.Array` is chosen to be compatible with the package for parallelization selected with [`@init_parallel_kernel`](@ref) (Array for Threads or Polyester, CUDA.CuArray or CUDA.CuDeviceArray for CUDA, AMDGPU.ROCArray or AMDGPU.ROCDeviceArray for AMDGPU and Metal.MtlArray or Metal.MtlDeviceArray for Metal; [`@parallel`](@ref) and [`@parallel_indices`](@ref) convert CUDA.CuArray, AMDGPU.ROCArray and Metal.MtlArray automatically to CUDA.CuDeviceArray, AMDGPU.ROCDeviceArray and Metal.MtlDeviceArray in kernels when required). -------------------------------------------------------------------------------- Data.CellArray{ndims} -Expands to `Data.CellArray{numbertype, ndims}`, where `numbertype` is the datatype selected with [`@init_parallel_kernel`](@ref) and the datatype `Data.CellArray` is chosen to be compatible with the package for parallelization selected with [`@init_parallel_kernel`](@ref) (CPUCellArray for Threads or Polyester, CuCellArray or CuDeviceCellArray for CUDA and ROCCellArray or ROCDeviceCellArray for AMDGPU; [`@parallel`](@ref) and [`@parallel_indices`](@ref) convert CellArray automatically to DeviceCellArray when required). +Expands to `Data.CellArray{numbertype, ndims}`, where `numbertype` is the datatype selected with [`@init_parallel_kernel`](@ref) and the datatype `Data.CellArray` is chosen to be compatible with the package for parallelization selected with [`@init_parallel_kernel`](@ref) (CPUCellArray for Threads or Polyester, CuCellArray or CuDeviceCellArray for CUDA, ROCCellArray or ROCDeviceCellArray for AMDGPU and MetalCellArray or MetalDeviceCellArray for Metal; [`@parallel`](@ref) and [`@parallel_indices`](@ref) convert CellArray automatically to DeviceCellArray when required). -------------------------------------------------------------------------------- Data.Cell{S} @@ -57,18 +57,18 @@ Expands to: `NTuple{N_tuple, Data.Cell{S}}` | `NamedTuple{names, NTuple{N_tuple, !!! note "Advanced" Data.DeviceArray{ndims} - Expands to `Data.DeviceArray{numbertype, ndims}`, where `numbertype` is the datatype selected with [`@init_parallel_kernel`](@ref) and the datatype `Data.DeviceArray` is chosen to be compatible with the package for parallelization selected with [`@init_parallel_kernel`](@ref) (Array for Threads or Polyester, CUDA.CuDeviceArray for CUDA AMDGPU.ROCDeviceArray for AMDGPU). 
+ Expands to `Data.DeviceArray{numbertype, ndims}`, where `numbertype` is the datatype selected with [`@init_parallel_kernel`](@ref) and the datatype `Data.DeviceArray` is chosen to be compatible with the package for parallelization selected with [`@init_parallel_kernel`](@ref) (Array for Threads or Polyester, CUDA.CuDeviceArray for CUDA, AMDGPU.ROCDeviceArray for AMDGPU and Metal.MtlDeviceArray for Metal). !!! warning - This datatype is not intended for explicit manual usage. [`@parallel`](@ref) and [`@parallel_indices`](@ref) convert CUDA.CuArray and AMDGPU.ROCArray automatically to CUDA.CuDeviceArray and AMDGPU.ROCDeviceArray in kernels when required. + This datatype is not intended for explicit manual usage. [`@parallel`](@ref) and [`@parallel_indices`](@ref) convert CUDA.CuArray, AMDGPU.ROCArray and Metal.MtlArray automatically to CUDA.CuDeviceArray, AMDGPU.ROCDeviceArray and Metal.MtlDeviceArray in kernels when required. -------------------------------------------------------------------------------- Data.DeviceCellArray{ndims} - Expands to `Data.DeviceCellArray{numbertype, ndims}`, where `numbertype` is the datatype selected with [`@init_parallel_kernel`](@ref) and the datatype `Data.DeviceCellArray` is chosen to be compatible with the package for parallelization selected with [`@init_parallel_kernel`](@ref) (CPUCellArray for Threads or Polyester, CuDeviceCellArray for CUDA and ROCDeviceCellArray for AMDGPU). + Expands to `Data.DeviceCellArray{numbertype, ndims}`, where `numbertype` is the datatype selected with [`@init_parallel_kernel`](@ref) and the datatype `Data.DeviceCellArray` is chosen to be compatible with the package for parallelization selected with [`@init_parallel_kernel`](@ref) (CPUCellArray for Threads or Polyester, CuDeviceCellArray for CUDA, ROCDeviceCellArray for AMDGPU and MetalDeviceCellArray for Metal). !!! warning - This datatype is not intended for explicit manual usage. [`@parallel`](@ref) and [`@parallel_indices`](@ref) convert CUDA.CuArray and AMDGPU.ROCArray automatically to CUDA.CuDeviceArray and AMDGPU.ROCDeviceArray in kernels when required. + This datatype is not intended for explicit manual usage. [`@parallel`](@ref) and [`@parallel_indices`](@ref) convert CUDA.CuArray, AMDGPU.ROCArray and Metal.MtlArray automatically to CUDA.CuDeviceArray, AMDGPU.ROCDeviceArray and Metal.MtlDeviceArray in kernels when required. """ const DATA_DOC_NUMBERTYPE_NONE = """ diff --git a/src/ParallelKernel/allocators.jl b/src/ParallelKernel/allocators.jl index 0877126e..ca47db03 100644 --- a/src/ParallelKernel/allocators.jl +++ b/src/ParallelKernel/allocators.jl @@ -3,7 +3,7 @@ const ZEROS_DOC = """ @zeros(args...) @zeros(args..., ) -Call `zeros(eltype, args...)`, where `eltype` is by default the `numbertype` selected with [`@init_parallel_kernel`](@ref) and the function `zeros` is chosen to be compatible with the package for parallelization selected with [`@init_parallel_kernel`](@ref) (zeros for Threads or Polyester, CUDA.zeros for CUDA and AMDGPU.zeros for AMDGPU). +Call `zeros(eltype, args...)`, where `eltype` is by default the `numbertype` selected with [`@init_parallel_kernel`](@ref) and the function `zeros` is chosen to be compatible with the package for parallelization selected with [`@init_parallel_kernel`](@ref) (zeros for Threads or Polyester, CUDA.zeros for CUDA, AMDGPU.zeros for AMDGPU and Metal.zeros for Metal). !!! 
note "Advanced" The `eltype` can be explicitly passed as keyword argument in order to be used instead of the default `numbertype` chosen with [`@init_parallel_kernel`](@ref). If no default `numbertype` was chosen [`@init_parallel_kernel`](@ref), then the keyword argument `eltype` is mandatory. This needs to be used with care to ensure that no datatype conversions occur in performance critical computations. @@ -31,7 +31,7 @@ const ONES_DOC = """ @ones(args...) @ones(args..., ) -Call `ones(eltype, args...)`, where `eltype` is by default the `numbertype` selected with [`@init_parallel_kernel`](@ref) and the function `ones` is chosen to be compatible with the package for parallelization selected with [`@init_parallel_kernel`](@ref) (ones for Threads or Polyester, CUDA.ones for CUDA and AMDGPU.ones for AMDGPU). +Call `ones(eltype, args...)`, where `eltype` is by default the `numbertype` selected with [`@init_parallel_kernel`](@ref) and the function `ones` is chosen to be compatible with the package for parallelization selected with [`@init_parallel_kernel`](@ref) (ones for Threads or Polyester, CUDA.ones for CUDA, AMDGPU.ones for AMDGPU and Metal.ones for Metal). !!! note "Advanced" The `eltype` can be explicitly passed as keyword argument in order to be used instead of the default `numbertype` chosen with [`@init_parallel_kernel`](@ref). If no default `numbertype` was chosen [`@init_parallel_kernel`](@ref), then the keyword argument `eltype` is mandatory. This needs to be used with care to ensure that no datatype conversions occur in performance critical computations. diff --git a/src/ParallelKernel/parallel.jl b/src/ParallelKernel/parallel.jl index 334003c9..f13f9ad1 100644 --- a/src/ParallelKernel/parallel.jl +++ b/src/ParallelKernel/parallel.jl @@ -15,8 +15,8 @@ Declare the `kernelcall` parallel. The kernel will automatically be called as re - `kernelcall`: a call to a kernel that is declared parallel. !!! note "Advanced optional arguments" - `ranges::Tuple{UnitRange{},UnitRange{},UnitRange{}} | Tuple{UnitRange{},UnitRange{}} | Tuple{UnitRange{}} | UnitRange{}`: the ranges of indices in each dimension for which computations must be performed. - - `nblocks::Tuple{Integer,Integer,Integer}`: the number of blocks to be used if the package CUDA or AMDGPU was selected with [`@init_parallel_kernel`](@ref). - - `nthreads::Tuple{Integer,Integer,Integer}`: the number of threads to be used if the package CUDA or AMDGPU was selected with [`@init_parallel_kernel`](@ref). + - `nblocks::Tuple{Integer,Integer,Integer}`: the number of blocks to be used if the package CUDA or AMDGPU or Metal was selected with [`@init_parallel_kernel`](@ref). + - `nthreads::Tuple{Integer,Integer,Integer}`: the number of threads to be used if the package CUDA or AMDGPU or Metal was selected with [`@init_parallel_kernel`](@ref). # Keyword arguments !!! note "Advanced" @@ -24,7 +24,7 @@ Declare the `kernelcall` parallel. The kernel will automatically be called as re - `ad_mode=Enzyme.Reverse`: the automatic differentiation mode (see the documentation of Enzyme.jl for more information). - `ad_annotations=()`: Enzyme variable annotations for automatic differentiation in the format `(=, =, ...)`, where `` can be a single variable or a tuple of variables (e.g., `ad_annotations=(Duplicated=B, Active=(a,b))`). Currently supported annotations are: $(keys(AD_SUPPORTED_ANNOTATIONS)). - `configcall=kernelcall`: a call to a kernel that is declared parallel, which is used for determining the kernel launch parameters. 
This keyword is useful, e.g., for generic automatic differentiation using the low-level submodule [`AD`](@ref). - - `backendkwargs...`: keyword arguments to be passed further to CUDA or AMDGPU (ignored for Threads or Polyester). + - `backendkwargs...`: keyword arguments to be passed further to CUDA or AMDGPU or Metal (ignored for Threads or Polyester). !!! note "Performance note" Kernel launch parameters are automatically defined with heuristics, where not defined with optional kernel arguments. For CUDA and AMDGPU, `nthreads` is typically set to (32,8,1) and `nblocks` accordingly to ensure that enough threads are launched. diff --git a/src/ParallelStencil.jl b/src/ParallelStencil.jl index 25f173d5..1703cf7c 100644 --- a/src/ParallelStencil.jl +++ b/src/ParallelStencil.jl @@ -42,7 +42,7 @@ https://github.com/omlins/ParallelStencil.jl - [`Data`](@ref) !! note "Activation of GPU support" - The support for GPU (CUDA or AMDGPU) is provided with extensions and requires therefore an explicit installation of the corresponding packages (CUDA.jl or AMDGPU.jl). Note that it is not required to import explicitly the corresponding module (CUDA or AMDGPU); this is automatically done by [`@init_parallel_stencil`](@ref). + The support for GPU (CUDA or AMDGPU or Metal) is provided with extensions and requires therefore an explicit installation of the corresponding packages (CUDA.jl or AMDGPU.jl or Metal.jl). Note that it is not required to import explicitly the corresponding module (CUDA or AMDGPU or Metal); this is automatically done by [`@init_parallel_stencil`](@ref). To see a description of a macro or module type `?` (including the `@`) or `?`, respectively. """ diff --git a/src/init_parallel_stencil.jl b/src/init_parallel_stencil.jl index d1272d89..8c819645 100644 --- a/src/init_parallel_stencil.jl +++ b/src/init_parallel_stencil.jl @@ -28,7 +28,7 @@ Initialize the package ParallelStencil, giving access to its main functionality. Creates a module `Data` in the module where `@init_parallel_stencil` is called from. The module `Data` contains the types as `Data.Number`, `Data.Array` and `Data.CellArray` (type `?Data` *after* calling `@init_parallel_stencil` to see the full description of the module). # Arguments -- `package::Module`: the package used for parallelization (CUDA or AMDGPU for GPU, or Threads or Polyester for CPU). +- `package::Module`: the package used for parallelization (CUDA or AMDGPU or Metal for GPU, or Threads or Polyester for CPU). - `numbertype::DataType`: the type of numbers used by @zeros, @ones, @rand and @fill and in all array types of module `Data` (e.g. Float32 or Float64). It is contained in `Data.Number` after @init_parallel_stencil. The `numbertype` can be omitted if the other arguments are given as keyword arguments (in that case, the `numbertype` will have to be given explicitly when using the types provided by the module `Data`). - `ndims::Integer`: the number of dimensions used for the stencil computations in the kernels: 1, 2 or 3 (overwritable in each kernel definition). - `inbounds::Bool=false`: whether to apply `@inbounds` to the kernels by default (overwritable in each kernel definition). diff --git a/src/parallel.jl b/src/parallel.jl index 27a2a86b..468401be 100644 --- a/src/parallel.jl +++ b/src/parallel.jl @@ -34,8 +34,8 @@ Declare the `kernelcall` parallel. The kernel will automatically be called as re - `kernelcall`: a call to a kernel that is declared parallel. !!! 
note "Advanced optional arguments" - `ranges::Tuple{UnitRange{},UnitRange{},UnitRange{}} | Tuple{UnitRange{},UnitRange{}} | Tuple{UnitRange{}} | UnitRange{}`: the ranges of indices in each dimension for which computations must be performed. - - `nblocks::Tuple{Integer,Integer,Integer}`: the number of blocks to be used if the package CUDA or AMDGPU was selected with [`@init_parallel_kernel`](@ref). - - `nthreads::Tuple{Integer,Integer,Integer}`: the number of threads to be used if the package CUDA or AMDGPU was selected with [`@init_parallel_kernel`](@ref). + - `nblocks::Tuple{Integer,Integer,Integer}`: the number of blocks to be used if the package CUDA or AMDGPU or Metal was selected with [`@init_parallel_kernel`](@ref). + - `nthreads::Tuple{Integer,Integer,Integer}`: the number of threads to be used if the package CUDA or AMDGPU or Metal was selected with [`@init_parallel_kernel`](@ref). # Keyword arguments - `memopt::Bool=false`: whether the kernel to be launched was generated with `memopt=true` (meaning the keyword was set in the kernel declaration). @@ -44,7 +44,7 @@ Declare the `kernelcall` parallel. The kernel will automatically be called as re - `ad_mode=Enzyme.Reverse`: the automatic differentiation mode (see the documentation of Enzyme.jl for more information). - `ad_annotations=()`: Enzyme variable annotations for automatic differentiation in the format `(=, =, ...)`, where `` can be a single variable or a tuple of variables (e.g., `ad_annotations=(Duplicated=B, Active=(a,b))`). Currently supported annotations are: $(keys(AD_SUPPORTED_ANNOTATIONS)). - `configcall=kernelcall`: a call to a kernel that is declared parallel, which is used for determining the kernel launch parameters. This keyword is useful, e.g., for generic automatic differentiation using the low-level submodule [`AD`](@ref). - - `backendkwargs...`: keyword arguments to be passed further to CUDA or AMDGPU (ignored for Threads and Polyester). + - `backendkwargs...`: keyword arguments to be passed further to CUDA or AMDGPU or Metal (ignored for Threads and Polyester). !!! note "Performance note" Kernel launch parameters are automatically defined with heuristics, where not defined with optional kernel arguments. For CUDA and AMDGPU, `nthreads` is typically set to (32,8,1) and `nblocks` accordingly to ensure that enough threads are launched. From 1dfaf380c3a868f665e1e2ca5c54ba04823d7679 Mon Sep 17 00:00:00 2001 From: GiackAloZ Date: Wed, 9 Oct 2024 17:33:35 +0200 Subject: [PATCH 21/46] Add more docs --- src/ParallelKernel/Data.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ParallelKernel/Data.jl b/src/ParallelKernel/Data.jl index 5798ed9d..02d0c5ec 100644 --- a/src/ParallelKernel/Data.jl +++ b/src/ParallelKernel/Data.jl @@ -84,12 +84,12 @@ The type of indices used in parallel kernels. -------------------------------------------------------------------------------- Data.Array{numbertype, ndims} -The datatype `Data.Array` is automatically chosen to be compatible with the package for parallelization selected with [`@init_parallel_kernel`](@ref) (Array for Threads or Polyester, CUDA.CuArray or CUDA.CuDeviceArray for CUDA and AMDGPU.ROCArray or AMDGPU.ROCDeviceArray for AMDGPU; [`@parallel`](@ref) and [`@parallel_indices`](@ref) convert CUDA.CuArray and AMDGPU.ROCArray automatically to CUDA.CuDeviceArray and AMDGPU.ROCDeviceArray in kernels when required). 
+The datatype `Data.Array` is automatically chosen to be compatible with the package for parallelization selected with [`@init_parallel_kernel`](@ref) (Array for Threads or Polyester, CUDA.CuArray or CUDA.CuDeviceArray for CUDA, AMDGPU.ROCArray or AMDGPU.ROCDeviceArray for AMDGPU and Metal.MtlArray or Metal.MtlDeviceArray for Metal; [`@parallel`](@ref) and [`@parallel_indices`](@ref) convert CUDA.CuArray, AMDGPU.ROCArray and Metal.MtlArray automatically to CUDA.CuDeviceArray, AMDGPU.ROCDeviceArray and Metal.MtlDeviceArray in kernels when required). -------------------------------------------------------------------------------- Data.CellArray{numbertype, ndims} -The datatype `Data.CellArray` is automatically chosen to be compatible with the package for parallelization selected with [`@init_parallel_kernel`](@ref) (CPUCellArray for Threads or Polyester, CuCellArray or CuDeviceCellArray for CUDA and ROCCellArray or ROCDeviceCellArray for AMDGPU; [`@parallel`](@ref) and [`@parallel_indices`](@ref) convert CellArray automatically to DeviceCellArray in kernels when required). +The datatype `Data.CellArray` is automatically chosen to be compatible with the package for parallelization selected with [`@init_parallel_kernel`](@ref) (CPUCellArray for Threads or Polyester, CuCellArray or CuDeviceCellArray for CUDA, ROCCellArray or ROCDeviceCellArray for AMDGPU and MetalCellArray or MetalDeviceCellArray for Metal; [`@parallel`](@ref) and [`@parallel_indices`](@ref) convert CellArray automatically to DeviceCellArray in kernels when required). -------------------------------------------------------------------------------- Data.Cell{numbertype, S} @@ -128,7 +128,7 @@ Expands to: `NTuple{N_tuple, Data.Cell{numbertype, S}}` | `NamedTuple{names, NTu The datatype `Data.DeviceArray` is automatically chosen to be compatible with the package for parallelization selected with [`@init_parallel_kernel`](@ref) (Array for Threads or Polyester, CUDA.CuDeviceArray for CUDA and AMDGPU.ROCDeviceArray for AMDGPU). !!! warning - This datatype is not intended for explicit manual usage. [`@parallel`](@ref) and [`@parallel_indices`](@ref) convert CUDA.CuArray and AMDGPU.ROCArray automatically to CUDA.CuDeviceArray and AMDGPU.ROCDeviceArray in kernels when required. + This datatype is not intended for explicit manual usage. [`@parallel`](@ref) and [`@parallel_indices`](@ref) convert CUDA.CuArray, AMDGPU.ROCArray and Metal.MtlArray automatically to CUDA.CuDeviceArray, AMDGPU.ROCDeviceArray and Metal.MtlDeviceArray in kernels when required. -------------------------------------------------------------------------------- Data.DeviceCellArray{numbertype, ndims} @@ -136,7 +136,7 @@ Expands to: `NTuple{N_tuple, Data.Cell{numbertype, S}}` | `NamedTuple{names, NTu The datatype `Data.DeviceCellArray` is automatically chosen to be compatible with the package for parallelization selected with [`@init_parallel_kernel`](@ref) (CPUCellArray for Threads or Polyester, CuDeviceCellArray for CUDA and ROCDeviceCellArray for AMDGPU). !!! warning - This datatype is not intended for explicit manual usage. [`@parallel`](@ref) and [`@parallel_indices`](@ref) convert CUDA.CuArray and AMDGPU.ROCArray automatically to CUDA.CuDeviceArray and AMDGPU.ROCDeviceArray in kernels when required. + This datatype is not intended for explicit manual usage. 
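A short hedged sketch of the host/device distinction described here (assuming Metal and `Float32` were selected with `@init_parallel_kernel`; the kernel and sizes are made up for illustration):

```julia
A = @zeros(16, 16)
A isa Data.Array{2}                 # true: Data.Array{2} resolves to Metal.MtlArray{Float32, 2}

@parallel_indices (ix, iy) function set_one!(A)
    A[ix, iy] = 1.0f0               # inside the kernel, A has already been converted to a
    return                          # Data.DeviceArray (Metal.MtlDeviceArray)
end
@parallel set_one!(A)
```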
[`@parallel`](@ref) and [`@parallel_indices`](@ref) convert CUDA.CuArray, AMDGPU.ROCArray and Metal.MtlArray automatically to CUDA.CuDeviceArray, AMDGPU.ROCDeviceArray and Metal.MtlDeviceArray in kernels when required. """ function Data_cuda(modulename::Symbol, numbertype::DataType, indextype::DataType) From e4d2f09896ed3dc71327e8caf451b55d2dcfbada Mon Sep 17 00:00:00 2001 From: GiackAloZ Date: Wed, 9 Oct 2024 17:40:19 +0200 Subject: [PATCH 22/46] Rollback litarals in macros --- src/FiniteDifferences.jl | 94 ++++++++++++++++++++-------------------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/src/FiniteDifferences.jl b/src/FiniteDifferences.jl index 4ccc77c0..a5266c98 100644 --- a/src/FiniteDifferences.jl +++ b/src/FiniteDifferences.jl @@ -54,8 +54,8 @@ macro d(A) @expandargs(A); esc(:( $A[$ix+1] - $A[$ix] )) end macro d2(A) @expandargs(A); esc(:( ($A[$ixi+1] - $A[$ixi]) - ($A[$ixi] - $A[$ixi-1]) )) end macro all(A) @expandargs(A); esc(:( $A[$ix ] )) end macro inn(A) @expandargs(A); esc(:( $A[$ixi ] )) end -macro av(A) @expandargs(A); esc(:(($A[$ix] + $A[$ix+1] )/2 )) end -macro harm(A) @expandargs(A); esc(:(1/(1/$A[$ix] + 1/$A[$ix+1])*2 )) end +macro av(A) @expandargs(A); esc(:(($A[$ix] + $A[$ix+1] )*0.5 )) end +macro harm(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix] + 1.0/$A[$ix+1])*2.0 )) end macro maxloc(A) @expandargs(A); esc(:( max( max($A[$ixi-1], $A[$ixi+1]), $A[$ixi] ) )) end macro minloc(A) @expandargs(A); esc(:( min( min($A[$ixi-1], $A[$ixi+1]), $A[$ixi] ) )) end @@ -167,16 +167,16 @@ macro all(A) @expandargs(A); esc(:( $A[$ix ,$iy ] )) end macro inn(A) @expandargs(A); esc(:( $A[$ixi ,$iyi ] )) end macro inn_x(A) @expandargs(A); esc(:( $A[$ixi ,$iy ] )) end macro inn_y(A) @expandargs(A); esc(:( $A[$ix ,$iyi ] )) end -macro av(A) @expandargs(A); esc(:(($A[$ix ,$iy ] + $A[$ix+1,$iy ] + $A[$ix,$iy+1] + $A[$ix+1,$iy+1])/4 )) end -macro av_xa(A) @expandargs(A); esc(:(($A[$ix ,$iy ] + $A[$ix+1,$iy ] )/2 )) end -macro av_ya(A) @expandargs(A); esc(:(($A[$ix ,$iy ] + $A[$ix ,$iy+1] )/2 )) end -macro av_xi(A) @expandargs(A); esc(:(($A[$ix ,$iyi ] + $A[$ix+1,$iyi ] )/2 )) end -macro av_yi(A) @expandargs(A); esc(:(($A[$ixi ,$iy ] + $A[$ixi ,$iy+1] )/2 )) end -macro harm(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iy ] + 1/$A[$ix+1,$iy ] + 1/$A[$ix,$iy+1] + 1/$A[$ix+1,$iy+1])*4 )) end -macro harm_xa(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iy ] + 1/$A[$ix+1,$iy ] )*2 )) end -macro harm_ya(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iy ] + 1/$A[$ix ,$iy+1] )*2 )) end -macro harm_xi(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iyi ] + 1/$A[$ix+1,$iyi ] )*2 )) end -macro harm_yi(A) @expandargs(A); esc(:(1/(1/$A[$ixi ,$iy ] + 1/$A[$ixi ,$iy+1] )*2 )) end +macro av(A) @expandargs(A); esc(:(($A[$ix ,$iy ] + $A[$ix+1,$iy ] + $A[$ix,$iy+1] + $A[$ix+1,$iy+1])*0.25 )) end +macro av_xa(A) @expandargs(A); esc(:(($A[$ix ,$iy ] + $A[$ix+1,$iy ] )*0.5 )) end +macro av_ya(A) @expandargs(A); esc(:(($A[$ix ,$iy ] + $A[$ix ,$iy+1] )*0.5 )) end +macro av_xi(A) @expandargs(A); esc(:(($A[$ix ,$iyi ] + $A[$ix+1,$iyi ] )*0.5 )) end +macro av_yi(A) @expandargs(A); esc(:(($A[$ixi ,$iy ] + $A[$ixi ,$iy+1] )*0.5 )) end +macro harm(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix ,$iy ] + 1.0/$A[$ix+1,$iy ] + 1.0/$A[$ix,$iy+1] + 1.0/$A[$ix+1,$iy+1])*4.0 )) end +macro harm_xa(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix ,$iy ] + 1.0/$A[$ix+1,$iy ] )*2.0 )) end +macro harm_ya(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix ,$iy ] + 1.0/$A[$ix ,$iy+1] )*2.0 )) end +macro harm_xi(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix ,$iyi ] + 
1.0/$A[$ix+1,$iyi ] )*2.0 )) end +macro harm_yi(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ixi ,$iy ] + 1.0/$A[$ixi ,$iy+1] )*2.0 )) end macro maxloc(A) @expandargs(A); esc(:( max( max( max($A[$ixi-1,$iyi ], $A[$ixi+1,$iyi ]) , $A[$ixi ,$iyi ] ), max($A[$ixi ,$iyi-1], $A[$ixi ,$iyi+1]) ) )) end macro minloc(A) @expandargs(A); esc(:( min( min( min($A[$ixi-1,$iyi ], $A[$ixi+1,$iyi ]) , $A[$ixi ,$iyi ] ), @@ -342,47 +342,47 @@ macro inn_yz(A) @expandargs(A); esc(:( $A[$ix ,$iyi ,$izi ] )) end macro av(A) @expandargs(A); esc(:(($A[$ix ,$iy ,$iz ] + $A[$ix+1,$iy ,$iz ] + $A[$ix+1,$iy+1,$iz ] + $A[$ix+1,$iy+1,$iz+1] + $A[$ix ,$iy+1,$iz+1] + $A[$ix ,$iy ,$iz+1] + - $A[$ix+1,$iy ,$iz+1] + $A[$ix ,$iy+1,$iz ] )/8)) end -macro av_xa(A) @expandargs(A); esc(:(($A[$ix ,$iy ,$iz ] + $A[$ix+1,$iy ,$iz ] )/2 )) end -macro av_ya(A) @expandargs(A); esc(:(($A[$ix ,$iy ,$iz ] + $A[$ix ,$iy+1,$iz ] )/2 )) end -macro av_za(A) @expandargs(A); esc(:(($A[$ix ,$iy ,$iz ] + $A[$ix ,$iy ,$iz+1] )/2 )) end -macro av_xi(A) @expandargs(A); esc(:(($A[$ix ,$iyi ,$izi ] + $A[$ix+1,$iyi ,$izi ] )/2 )) end -macro av_yi(A) @expandargs(A); esc(:(($A[$ixi ,$iy ,$izi ] + $A[$ixi ,$iy+1,$izi ] )/2 )) end -macro av_zi(A) @expandargs(A); esc(:(($A[$ixi ,$iyi ,$iz ] + $A[$ixi ,$iyi ,$iz+1] )/2 )) end + $A[$ix+1,$iy ,$iz+1] + $A[$ix ,$iy+1,$iz ] )*0.125)) end +macro av_xa(A) @expandargs(A); esc(:(($A[$ix ,$iy ,$iz ] + $A[$ix+1,$iy ,$iz ] )*0.5 )) end +macro av_ya(A) @expandargs(A); esc(:(($A[$ix ,$iy ,$iz ] + $A[$ix ,$iy+1,$iz ] )*0.5 )) end +macro av_za(A) @expandargs(A); esc(:(($A[$ix ,$iy ,$iz ] + $A[$ix ,$iy ,$iz+1] )*0.5 )) end +macro av_xi(A) @expandargs(A); esc(:(($A[$ix ,$iyi ,$izi ] + $A[$ix+1,$iyi ,$izi ] )*0.5 )) end +macro av_yi(A) @expandargs(A); esc(:(($A[$ixi ,$iy ,$izi ] + $A[$ixi ,$iy+1,$izi ] )*0.5 )) end +macro av_zi(A) @expandargs(A); esc(:(($A[$ixi ,$iyi ,$iz ] + $A[$ixi ,$iyi ,$iz+1] )*0.5 )) end macro av_xya(A) @expandargs(A); esc(:(($A[$ix ,$iy ,$iz ] + $A[$ix+1,$iy ,$iz ] + - $A[$ix ,$iy+1,$iz ] + $A[$ix+1,$iy+1,$iz ] )/4 )) end + $A[$ix ,$iy+1,$iz ] + $A[$ix+1,$iy+1,$iz ] )*0.25 )) end macro av_xza(A) @expandargs(A); esc(:(($A[$ix ,$iy ,$iz ] + $A[$ix+1,$iy ,$iz ] + - $A[$ix ,$iy ,$iz+1] + $A[$ix+1,$iy ,$iz+1] )/4 )) end + $A[$ix ,$iy ,$iz+1] + $A[$ix+1,$iy ,$iz+1] )*0.25 )) end macro av_yza(A) @expandargs(A); esc(:(($A[$ix ,$iy ,$iz ] + $A[$ix ,$iy+1,$iz ] + - $A[$ix ,$iy ,$iz+1] + $A[$ix ,$iy+1,$iz+1] )/4 )) end + $A[$ix ,$iy ,$iz+1] + $A[$ix ,$iy+1,$iz+1] )*0.25 )) end macro av_xyi(A) @expandargs(A); esc(:(($A[$ix ,$iy ,$izi ] + $A[$ix+1,$iy ,$izi ] + - $A[$ix ,$iy+1,$izi ] + $A[$ix+1,$iy+1,$izi ] )/4 )) end + $A[$ix ,$iy+1,$izi ] + $A[$ix+1,$iy+1,$izi ] )*0.25 )) end macro av_xzi(A) @expandargs(A); esc(:(($A[$ix ,$iyi ,$iz ] + $A[$ix+1,$iyi ,$iz ] + - $A[$ix ,$iyi ,$iz+1] + $A[$ix+1,$iyi ,$iz+1] )/4 )) end + $A[$ix ,$iyi ,$iz+1] + $A[$ix+1,$iyi ,$iz+1] )*0.25 )) end macro av_yzi(A) @expandargs(A); esc(:(($A[$ixi ,$iy ,$iz ] + $A[$ixi ,$iy+1,$iz ] + - $A[$ixi ,$iy ,$iz+1] + $A[$ixi ,$iy+1,$iz+1] )/4 )) end -macro harm(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iy ,$iz ] + 1/$A[$ix+1,$iy ,$iz ] + - 1/$A[$ix+1,$iy+1,$iz ] + 1/$A[$ix+1,$iy+1,$iz+1] + - 1/$A[$ix ,$iy+1,$iz+1] + 1/$A[$ix ,$iy ,$iz+1] + - 1/$A[$ix+1,$iy ,$iz+1] + 1/$A[$ix ,$iy+1,$iz ] )*8)) end -macro harm_xa(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iy ,$iz ] + 1/$A[$ix+1,$iy ,$iz ] )*2 )) end -macro harm_ya(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iy ,$iz ] + 1/$A[$ix ,$iy+1,$iz ] )*2 )) end -macro harm_za(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iy ,$iz ] + 
1/$A[$ix ,$iy ,$iz+1] )*2 )) end -macro harm_xi(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iyi ,$izi ] + 1/$A[$ix+1,$iyi ,$izi ] )*2 )) end -macro harm_yi(A) @expandargs(A); esc(:(1/(1/$A[$ixi ,$iy ,$izi ] + 1/$A[$ixi ,$iy+1,$izi ] )*2 )) end -macro harm_zi(A) @expandargs(A); esc(:(1/(1/$A[$ixi ,$iyi ,$iz ] + 1/$A[$ixi ,$iyi ,$iz+1] )*2 )) end -macro harm_xya(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iy ,$iz ] + 1/$A[$ix+1,$iy ,$iz ] + - 1/$A[$ix ,$iy+1,$iz ] + 1/$A[$ix+1,$iy+1,$iz ] )*4 )) end -macro harm_xza(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iy ,$iz ] + 1/$A[$ix+1,$iy ,$iz ] + - 1/$A[$ix ,$iy ,$iz+1] + 1/$A[$ix+1,$iy ,$iz+1] )*4 )) end -macro harm_yza(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iy ,$iz ] + 1/$A[$ix ,$iy+1,$iz ] + - 1/$A[$ix ,$iy ,$iz+1] + 1/$A[$ix ,$iy+1,$iz+1] )*4 )) end -macro harm_xyi(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iy ,$izi ] + 1/$A[$ix+1,$iy ,$izi ] + - 1/$A[$ix ,$iy+1,$izi ] + 1/$A[$ix+1,$iy+1,$izi ] )*4 )) end -macro harm_xzi(A) @expandargs(A); esc(:(1/(1/$A[$ix ,$iyi ,$iz ] + 1/$A[$ix+1,$iyi ,$iz ] + - 1/$A[$ix ,$iyi ,$iz+1] + 1/$A[$ix+1,$iyi ,$iz+1] )*4 )) end -macro harm_yzi(A) @expandargs(A); esc(:(1/(1/$A[$ixi ,$iy ,$iz ] + 1/$A[$ixi ,$iy+1,$iz ] + - 1/$A[$ixi ,$iy ,$iz+1] + 1/$A[$ixi ,$iy+1,$iz+1] )*4 )) end + $A[$ixi ,$iy ,$iz+1] + $A[$ixi ,$iy+1,$iz+1] )*0.25 )) end +macro harm(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix ,$iy ,$iz ] + 1.0/$A[$ix+1,$iy ,$iz ] + + 1.0/$A[$ix+1,$iy+1,$iz ] + 1.0/$A[$ix+1,$iy+1,$iz+1] + + 1.0/$A[$ix ,$iy+1,$iz+1] + 1.0/$A[$ix ,$iy ,$iz+1] + + 1.0/$A[$ix+1,$iy ,$iz+1] + 1.0/$A[$ix ,$iy+1,$iz ] )*8.0)) end +macro harm_xa(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix ,$iy ,$iz ] + 1.0/$A[$ix+1,$iy ,$iz ] )*2.0 )) end +macro harm_ya(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix ,$iy ,$iz ] + 1.0/$A[$ix ,$iy+1,$iz ] )*2.0 )) end +macro harm_za(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix ,$iy ,$iz ] + 1.0/$A[$ix ,$iy ,$iz+1] )*2.0 )) end +macro harm_xi(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix ,$iyi ,$izi ] + 1.0/$A[$ix+1,$iyi ,$izi ] )*2.0 )) end +macro harm_yi(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ixi ,$iy ,$izi ] + 1.0/$A[$ixi ,$iy+1,$izi ] )*2.0 )) end +macro harm_zi(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ixi ,$iyi ,$iz ] + 1.0/$A[$ixi ,$iyi ,$iz+1] )*2.0 )) end +macro harm_xya(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix ,$iy ,$iz ] + 1.0/$A[$ix+1,$iy ,$iz ] + + 1.0/$A[$ix ,$iy+1,$iz ] + 1.0/$A[$ix+1,$iy+1,$iz ] )*4.0 )) end +macro harm_xza(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix ,$iy ,$iz ] + 1.0/$A[$ix+1,$iy ,$iz ] + + 1.0/$A[$ix ,$iy ,$iz+1] + 1.0/$A[$ix+1,$iy ,$iz+1] )*4.0 )) end +macro harm_yza(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix ,$iy ,$iz ] + 1.0/$A[$ix ,$iy+1,$iz ] + + 1.0/$A[$ix ,$iy ,$iz+1] + 1.0/$A[$ix ,$iy+1,$iz+1] )*4.0 )) end +macro harm_xyi(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix ,$iy ,$izi ] + 1.0/$A[$ix+1,$iy ,$izi ] + + 1.0/$A[$ix ,$iy+1,$izi ] + 1.0/$A[$ix+1,$iy+1,$izi ] )*4.0 )) end +macro harm_xzi(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix ,$iyi ,$iz ] + 1.0/$A[$ix+1,$iyi ,$iz ] + + 1.0/$A[$ix ,$iyi ,$iz+1] + 1.0/$A[$ix+1,$iyi ,$iz+1] )*4.0 )) end +macro harm_yzi(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ixi ,$iy ,$iz ] + 1.0/$A[$ixi ,$iy+1,$iz ] + + 1.0/$A[$ixi ,$iy ,$iz+1] + 1.0/$A[$ixi ,$iy+1,$iz+1] )*4.0 )) end macro maxloc(A) @expandargs(A); esc(:( max( max( max( max($A[$ixi-1,$iyi ,$izi ], $A[$ixi+1,$iyi ,$izi ]) , $A[$ixi ,$iyi ,$izi ] ), max($A[$ixi ,$iyi-1,$izi ], $A[$ixi ,$iyi+1,$izi ]) ), max($A[$ixi ,$iyi ,$izi-1], $A[$ixi ,$iyi ,$izi+1]) ) )) end From 2520510623a758e379aba037ea3d957ad57d5ba0 Mon Sep 17 00:00:00 2001 
From: GiackAloZ Date: Wed, 9 Oct 2024 17:45:10 +0200 Subject: [PATCH 23/46] More rollbacks --- src/kernel_language.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/kernel_language.jl b/src/kernel_language.jl index cfc5c819..6c7e4dd2 100644 --- a/src/kernel_language.jl +++ b/src/kernel_language.jl @@ -150,12 +150,12 @@ $((:( $A_head = @sharedMem(eltype($A), (Int64($nx_l), Int64 for (A, s) in shmem_vars for (shmem_offset, nx_l, ny_l, A_head) = ((shmem_exprs[A][:offset], s[:nx_l], s[:ny_l], s[:A_head]),) )... ) -$((:( $reg = 0 # e.g. A_ixm1_iyp2_izp2 = 0 +$((:( $reg = 0.0 # e.g. A_ixm1_iyp2_izp2 = 0.0 ) for A in optvars for regs in values(regqueue_tails[A]) for reg in values(regs) )... ) -$((:( $reg = 0 # e.g. A_ixm1_iyp2_izp3 = 0 +$((:( $reg = 0.0 # e.g. A_ixm1_iyp2_izp3 = 0.0 ) for A in optvars for regs in values(regqueue_heads[A]) for reg in values(regs) )... From 5495c6df2c0d14655b9a7307283052298351db1b Mon Sep 17 00:00:00 2001 From: GiackAloZ Date: Wed, 9 Oct 2024 17:54:33 +0200 Subject: [PATCH 24/46] Fix harmonic macros --- src/FiniteDifferences.jl | 56 ++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/src/FiniteDifferences.jl b/src/FiniteDifferences.jl index a5266c98..4bddbadb 100644 --- a/src/FiniteDifferences.jl +++ b/src/FiniteDifferences.jl @@ -55,7 +55,7 @@ macro d2(A) @expandargs(A); esc(:( ($A[$ixi+1] - $A[$ixi]) - ($A[$ixi] - macro all(A) @expandargs(A); esc(:( $A[$ix ] )) end macro inn(A) @expandargs(A); esc(:( $A[$ixi ] )) end macro av(A) @expandargs(A); esc(:(($A[$ix] + $A[$ix+1] )*0.5 )) end -macro harm(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix] + 1.0/$A[$ix+1])*2.0 )) end +macro harm(A) @expandargs(A); esc(:(2/(1/$A[$ix] + 1/$A[$ix+1]) )) end macro maxloc(A) @expandargs(A); esc(:( max( max($A[$ixi-1], $A[$ixi+1]), $A[$ixi] ) )) end macro minloc(A) @expandargs(A); esc(:( min( min($A[$ixi-1], $A[$ixi+1]), $A[$ixi] ) )) end @@ -172,11 +172,11 @@ macro av_xa(A) @expandargs(A); esc(:(($A[$ix ,$iy ] + $A[$ix+1,$iy ] )*0 macro av_ya(A) @expandargs(A); esc(:(($A[$ix ,$iy ] + $A[$ix ,$iy+1] )*0.5 )) end macro av_xi(A) @expandargs(A); esc(:(($A[$ix ,$iyi ] + $A[$ix+1,$iyi ] )*0.5 )) end macro av_yi(A) @expandargs(A); esc(:(($A[$ixi ,$iy ] + $A[$ixi ,$iy+1] )*0.5 )) end -macro harm(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix ,$iy ] + 1.0/$A[$ix+1,$iy ] + 1.0/$A[$ix,$iy+1] + 1.0/$A[$ix+1,$iy+1])*4.0 )) end -macro harm_xa(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix ,$iy ] + 1.0/$A[$ix+1,$iy ] )*2.0 )) end -macro harm_ya(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix ,$iy ] + 1.0/$A[$ix ,$iy+1] )*2.0 )) end -macro harm_xi(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix ,$iyi ] + 1.0/$A[$ix+1,$iyi ] )*2.0 )) end -macro harm_yi(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ixi ,$iy ] + 1.0/$A[$ixi ,$iy+1] )*2.0 )) end +macro harm(A) @expandargs(A); esc(:(4/(1/$A[$ix ,$iy ] + 1/$A[$ix+1,$iy ] + 1/$A[$ix,$iy+1] + 1/$A[$ix+1,$iy+1]) )) end +macro harm_xa(A) @expandargs(A); esc(:(2/(1/$A[$ix ,$iy ] + 1/$A[$ix+1,$iy ] ) )) end +macro harm_ya(A) @expandargs(A); esc(:(2/(1/$A[$ix ,$iy ] + 1/$A[$ix ,$iy+1] ) )) end +macro harm_xi(A) @expandargs(A); esc(:(2/(1/$A[$ix ,$iyi ] + 1/$A[$ix+1,$iyi ] ) )) end +macro harm_yi(A) @expandargs(A); esc(:(2/(1/$A[$ixi ,$iy ] + 1/$A[$ixi ,$iy+1] ) )) end macro maxloc(A) @expandargs(A); esc(:( max( max( max($A[$ixi-1,$iyi ], $A[$ixi+1,$iyi ]) , $A[$ixi ,$iyi ] ), max($A[$ixi ,$iyi-1], $A[$ixi ,$iyi+1]) ) )) end macro minloc(A) @expandargs(A); esc(:( min( min( min($A[$ixi-1,$iyi ], $A[$ixi+1,$iyi ]) , $A[$ixi 
,$iyi ] ), @@ -361,28 +361,28 @@ macro av_xzi(A) @expandargs(A); esc(:(($A[$ix ,$iyi ,$iz ] + $A[$ix+1,$iyi $A[$ix ,$iyi ,$iz+1] + $A[$ix+1,$iyi ,$iz+1] )*0.25 )) end macro av_yzi(A) @expandargs(A); esc(:(($A[$ixi ,$iy ,$iz ] + $A[$ixi ,$iy+1,$iz ] + $A[$ixi ,$iy ,$iz+1] + $A[$ixi ,$iy+1,$iz+1] )*0.25 )) end -macro harm(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix ,$iy ,$iz ] + 1.0/$A[$ix+1,$iy ,$iz ] + - 1.0/$A[$ix+1,$iy+1,$iz ] + 1.0/$A[$ix+1,$iy+1,$iz+1] + - 1.0/$A[$ix ,$iy+1,$iz+1] + 1.0/$A[$ix ,$iy ,$iz+1] + - 1.0/$A[$ix+1,$iy ,$iz+1] + 1.0/$A[$ix ,$iy+1,$iz ] )*8.0)) end -macro harm_xa(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix ,$iy ,$iz ] + 1.0/$A[$ix+1,$iy ,$iz ] )*2.0 )) end -macro harm_ya(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix ,$iy ,$iz ] + 1.0/$A[$ix ,$iy+1,$iz ] )*2.0 )) end -macro harm_za(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix ,$iy ,$iz ] + 1.0/$A[$ix ,$iy ,$iz+1] )*2.0 )) end -macro harm_xi(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix ,$iyi ,$izi ] + 1.0/$A[$ix+1,$iyi ,$izi ] )*2.0 )) end -macro harm_yi(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ixi ,$iy ,$izi ] + 1.0/$A[$ixi ,$iy+1,$izi ] )*2.0 )) end -macro harm_zi(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ixi ,$iyi ,$iz ] + 1.0/$A[$ixi ,$iyi ,$iz+1] )*2.0 )) end -macro harm_xya(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix ,$iy ,$iz ] + 1.0/$A[$ix+1,$iy ,$iz ] + - 1.0/$A[$ix ,$iy+1,$iz ] + 1.0/$A[$ix+1,$iy+1,$iz ] )*4.0 )) end -macro harm_xza(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix ,$iy ,$iz ] + 1.0/$A[$ix+1,$iy ,$iz ] + - 1.0/$A[$ix ,$iy ,$iz+1] + 1.0/$A[$ix+1,$iy ,$iz+1] )*4.0 )) end -macro harm_yza(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix ,$iy ,$iz ] + 1.0/$A[$ix ,$iy+1,$iz ] + - 1.0/$A[$ix ,$iy ,$iz+1] + 1.0/$A[$ix ,$iy+1,$iz+1] )*4.0 )) end -macro harm_xyi(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix ,$iy ,$izi ] + 1.0/$A[$ix+1,$iy ,$izi ] + - 1.0/$A[$ix ,$iy+1,$izi ] + 1.0/$A[$ix+1,$iy+1,$izi ] )*4.0 )) end -macro harm_xzi(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ix ,$iyi ,$iz ] + 1.0/$A[$ix+1,$iyi ,$iz ] + - 1.0/$A[$ix ,$iyi ,$iz+1] + 1.0/$A[$ix+1,$iyi ,$iz+1] )*4.0 )) end -macro harm_yzi(A) @expandargs(A); esc(:(1.0/(1.0/$A[$ixi ,$iy ,$iz ] + 1.0/$A[$ixi ,$iy+1,$iz ] + - 1.0/$A[$ixi ,$iy ,$iz+1] + 1.0/$A[$ixi ,$iy+1,$iz+1] )*4.0 )) end +macro harm(A) @expandargs(A); esc(:(8/(1/$A[$ix ,$iy ,$iz ] + 1/$A[$ix+1,$iy ,$iz ] + + 1/$A[$ix+1,$iy+1,$iz ] + 1/$A[$ix+1,$iy+1,$iz+1] + + 1/$A[$ix ,$iy+1,$iz+1] + 1/$A[$ix ,$iy ,$iz+1] + + 1/$A[$ix+1,$iy ,$iz+1] + 1/$A[$ix ,$iy+1,$iz ] ) )) end +macro harm_xa(A) @expandargs(A); esc(:(2/(1/$A[$ix ,$iy ,$iz ] + 1/$A[$ix+1,$iy ,$iz ] ) )) end +macro harm_ya(A) @expandargs(A); esc(:(2/(1/$A[$ix ,$iy ,$iz ] + 1/$A[$ix ,$iy+1,$iz ] ) )) end +macro harm_za(A) @expandargs(A); esc(:(2/(1/$A[$ix ,$iy ,$iz ] + 1/$A[$ix ,$iy ,$iz+1] ) )) end +macro harm_xi(A) @expandargs(A); esc(:(2/(1/$A[$ix ,$iyi ,$izi ] + 1/$A[$ix+1,$iyi ,$izi ] ) )) end +macro harm_yi(A) @expandargs(A); esc(:(2/(1/$A[$ixi ,$iy ,$izi ] + 1/$A[$ixi ,$iy+1,$izi ] ) )) end +macro harm_zi(A) @expandargs(A); esc(:(2/(1/$A[$ixi ,$iyi ,$iz ] + 1/$A[$ixi ,$iyi ,$iz+1] ) )) end +macro harm_xya(A) @expandargs(A); esc(:(4/(1/$A[$ix ,$iy ,$iz ] + 1/$A[$ix+1,$iy ,$iz ] + + 1/$A[$ix ,$iy+1,$iz ] + 1/$A[$ix+1,$iy+1,$iz ] ) )) end +macro harm_xza(A) @expandargs(A); esc(:(4/(1/$A[$ix ,$iy ,$iz ] + 1/$A[$ix+1,$iy ,$iz ] + + 1/$A[$ix ,$iy ,$iz+1] + 1/$A[$ix+1,$iy ,$iz+1] ) )) end +macro harm_yza(A) @expandargs(A); esc(:(4/(1/$A[$ix ,$iy ,$iz ] + 1/$A[$ix ,$iy+1,$iz ] + + 1/$A[$ix ,$iy ,$iz+1] + 1/$A[$ix ,$iy+1,$iz+1] ) )) end +macro harm_xyi(A) @expandargs(A); 
esc(:(4/(1/$A[$ix ,$iy ,$izi ] + 1/$A[$ix+1,$iy ,$izi ] + + 1/$A[$ix ,$iy+1,$izi ] + 1/$A[$ix+1,$iy+1,$izi ] ) )) end +macro harm_xzi(A) @expandargs(A); esc(:(4/(1/$A[$ix ,$iyi ,$iz ] + 1/$A[$ix+1,$iyi ,$iz ] + + 1/$A[$ix ,$iyi ,$iz+1] + 1/$A[$ix+1,$iyi ,$iz+1] ) )) end +macro harm_yzi(A) @expandargs(A); esc(:(4/(1/$A[$ixi ,$iy ,$iz ] + 1/$A[$ixi ,$iy+1,$iz ] + + 1/$A[$ixi ,$iy ,$iz+1] + 1/$A[$ixi ,$iy+1,$iz+1] ) )) end macro maxloc(A) @expandargs(A); esc(:( max( max( max( max($A[$ixi-1,$iyi ,$izi ], $A[$ixi+1,$iyi ,$izi ]) , $A[$ixi ,$iyi ,$izi ] ), max($A[$ixi ,$iyi-1,$izi ], $A[$ixi ,$iyi+1,$izi ]) ), max($A[$ixi ,$iyi ,$izi-1], $A[$ixi ,$iyi ,$izi+1]) ) )) end From f8c751f3e3c5177606ea4e395f8e0ad8974e8079 Mon Sep 17 00:00:00 2001 From: GiackAloZ Date: Thu, 10 Oct 2024 10:37:57 +0200 Subject: [PATCH 25/46] Partially fix rand metal --- src/ParallelKernel/Data.jl | 18 +-- src/ParallelKernel/MetalExt/allocators.jl | 10 +- src/ParallelKernel/MetalExt/shared.jl | 3 +- test/ParallelKernel/test_allocators.jl | 128 +++++++++++++--------- 4 files changed, 91 insertions(+), 68 deletions(-) diff --git a/src/ParallelKernel/Data.jl b/src/ParallelKernel/Data.jl index 02d0c5ec..736f0339 100644 --- a/src/ParallelKernel/Data.jl +++ b/src/ParallelKernel/Data.jl @@ -21,7 +21,7 @@ Expands to `Data.Array{numbertype, ndims}`, where `numbertype` is the datatype s -------------------------------------------------------------------------------- Data.CellArray{ndims} -Expands to `Data.CellArray{numbertype, ndims}`, where `numbertype` is the datatype selected with [`@init_parallel_kernel`](@ref) and the datatype `Data.CellArray` is chosen to be compatible with the package for parallelization selected with [`@init_parallel_kernel`](@ref) (CPUCellArray for Threads or Polyester, CuCellArray or CuDeviceCellArray for CUDA, ROCCellArray or ROCDeviceCellArray for AMDGPU and MetalCellArray or MetalDeviceCellArray for Metal; [`@parallel`](@ref) and [`@parallel_indices`](@ref) convert CellArray automatically to DeviceCellArray when required). +Expands to `Data.CellArray{numbertype, ndims}`, where `numbertype` is the datatype selected with [`@init_parallel_kernel`](@ref) and the datatype `Data.CellArray` is chosen to be compatible with the package for parallelization selected with [`@init_parallel_kernel`](@ref) (CPUCellArray for Threads or Polyester, CuCellArray or CuDeviceCellArray for CUDA, ROCCellArray or ROCDeviceCellArray for AMDGPU and MtlCellArray or MtlDeviceCellArray for Metal; [`@parallel`](@ref) and [`@parallel_indices`](@ref) convert CellArray automatically to DeviceCellArray when required). -------------------------------------------------------------------------------- Data.Cell{S} @@ -89,7 +89,7 @@ The datatype `Data.Array` is automatically chosen to be compatible with the pack -------------------------------------------------------------------------------- Data.CellArray{numbertype, ndims} -The datatype `Data.CellArray` is automatically chosen to be compatible with the package for parallelization selected with [`@init_parallel_kernel`](@ref) (CPUCellArray for Threads or Polyester, CuCellArray or CuDeviceCellArray for CUDA, ROCCellArray or ROCDeviceCellArray for AMDGPU and MetalCellArray or MetalDeviceCellArray for Metal; [`@parallel`](@ref) and [`@parallel_indices`](@ref) convert CellArray automatically to DeviceCellArray in kernels when required). 
+The datatype `Data.CellArray` is automatically chosen to be compatible with the package for parallelization selected with [`@init_parallel_kernel`](@ref) (CPUCellArray for Threads or Polyester, CuCellArray or CuDeviceCellArray for CUDA, ROCCellArray or ROCDeviceCellArray for AMDGPU and MtlCellArray or MetalDeviceCellArray for Metal; [`@parallel`](@ref) and [`@parallel_indices`](@ref) convert CellArray automatically to DeviceCellArray in kernels when required). -------------------------------------------------------------------------------- Data.Cell{numbertype, S} @@ -231,23 +231,23 @@ function Data_metal(modulename::Symbol, numbertype::DataType, indextype::DataTyp Data_module = if (numbertype == NUMBERTYPE_NONE) :(baremodule $modulename # NOTE: there cannot be any newline before 'module Data' or it will create a begin end block and the module creation will fail. import Base, Metal, ParallelStencil.ParallelKernel.CellArrays, ParallelStencil.ParallelKernel.StaticArrays - # TODO: the constructors defined by CellArrays.@define_ROCCellArray lead to pre-compilation issues due to a bug in Julia. We therefore only create the type alias here for now. - const MetalCellArray{T,N,B,T_elem} = CellArrays.CellArray{T,N,B,Metal.MtlArray{T_elem,CellArrays._N}} + # TODO: the constructors defined by CellArrays.@define_MtlCellArray lead to pre-compilation issues due to a bug in Julia. We therefore only create the type alias here for now. + const MtlCellArray{T,N,B,T_elem} = CellArrays.CellArray{T,N,B,Metal.MtlArray{T_elem,CellArrays._N}} # const Index = $indextype const Array{T, N} = Metal.MtlArray{T, N} const DeviceArray{T, N} = Metal.MtlDeviceArray{T, N} const Cell{T, S} = Union{StaticArrays.SArray{S, T}, StaticArrays.FieldArray{S, T}} const DeviceCell{T, S} = Union{StaticArrays.SArray{S, T}, StaticArrays.FieldArray{S, T}} - const CellArray{T_elem, N, B} = MetalCellArray{<:Cell{T_elem},N,B,T_elem} + const CellArray{T_elem, N, B} = MtlCellArray{<:Cell{T_elem},N,B,T_elem} const DeviceCellArray{T_elem, N, B} = CellArrays.CellArray{<:DeviceCell{T_elem},N,B,<:Metal.MtlDeviceArray{T_elem,CellArrays._N}} $(create_shared_exprs(numbertype, indextype)) end) else :(baremodule $modulename # NOTE: there cannot be any newline before 'module Data' or it will create a begin end block and the module creation will fail. import Base, Metal, ParallelStencil.ParallelKernel.CellArrays, ParallelStencil.ParallelKernel.StaticArrays - # TODO: the constructors defined by CellArrays.@define_ROCCellArray lead to pre-compilation issues due to a bug in Julia. We therefore only create the type alias here for now. - const MetalCellArray{T,N,B,T_elem} = CellArrays.CellArray{T,N,B,Metal.MtlArray{T_elem,CellArrays._N}} + # TODO: the constructors defined by CellArrays.@define_MtlCellArray lead to pre-compilation issues due to a bug in Julia. We therefore only create the type alias here for now. 
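# Illustrative aside (assumed example, not taken from the patch): with the alias defined just
# below, a field of 2x2 Float32 cells on a 2-D grid has a type of the form
#   MtlCellArray{StaticArrays.SMatrix{2,2,Float32,4}, 2, B, Float32}
# i.e. a CellArrays.CellArray whose backing storage is a 3-D Metal.MtlArray{Float32}
# (blocklength x cell length x number of blocks), matching the 3-D buffer that
# rand_metal constructs further below in the Metal allocators.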
+ const MtlCellArray{T,N,B,T_elem} = CellArrays.CellArray{T,N,B,Metal.MtlArray{T_elem,CellArrays._N}} # const Index = $indextype const Number = $numbertype @@ -255,13 +255,13 @@ function Data_metal(modulename::Symbol, numbertype::DataType, indextype::DataTyp const DeviceArray{N} = Metal.MtlDeviceArray{$numbertype, N} const Cell{S} = Union{StaticArrays.SArray{S, $numbertype}, StaticArrays.FieldArray{S, $numbertype}} const DeviceCell{S} = Union{StaticArrays.SArray{S, $numbertype}, StaticArrays.FieldArray{S, $numbertype}} - const CellArray{N, B} = MetalCellArray{<:Cell,N,B,$numbertype} + const CellArray{N, B} = MtlCellArray{<:Cell,N,B,$numbertype} const DeviceCellArray{N, B} = CellArrays.CellArray{<:DeviceCell,N,B,<:Metal.MtlDeviceArray{$numbertype,CellArrays._N}} const TArray{T, N} = Metal.MtlArray{T, N} const DeviceTArray{T, N} = Metal.MtlDeviceArray{T, N} const TCell{T, S} = Union{StaticArrays.SArray{S, T}, StaticArrays.FieldArray{S, T}} const DeviceTCell{T, S} = Union{StaticArrays.SArray{S, T}, StaticArrays.FieldArray{S, T}} - const TCellArray{T_elem, N, B} = MetalCellArray{<:TCell{T_elem},N,B,T_elem} + const TCellArray{T_elem, N, B} = MtlCellArray{<:TCell{T_elem},N,B,T_elem} const DeviceTCellArray{T_elem, N, B} = CellArrays.CellArray{<:DeviceTCell{T_elem},N,B,<:Metal.MtlDeviceArray{T_elem,CellArrays._N}} $(create_shared_exprs(numbertype, indextype)) end) diff --git a/src/ParallelKernel/MetalExt/allocators.jl b/src/ParallelKernel/MetalExt/allocators.jl index e207d9d2..f2251f51 100644 --- a/src/ParallelKernel/MetalExt/allocators.jl +++ b/src/ParallelKernel/MetalExt/allocators.jl @@ -2,17 +2,17 @@ ParallelStencil.ParallelKernel.zeros_metal(::Type{T}, blocklength, args...) where {T<:Number} = (check_datatype_metal(T); Metal.zeros(T, args...)) # (blocklength is ignored if neither celldims nor celltype is set) ParallelStencil.ParallelKernel.ones_metal(::Type{T}, blocklength, args...) where {T<:Number} = (check_datatype_metal(T); Metal.ones(T, args...)) -ParallelStencil.ParallelKernel.rand_metal(::Type{T}, blocklength, args...) where {T<:Union{Number,Enum}} = (check_datatype_metal(T); MtlArray(rand_cpu(T, blocklength, args...))) +ParallelStencil.ParallelKernel.rand_metal(::Type{T}, blocklength, args...) where {T<:Union{Number,Enum}} = MtlArray(rand_cpu(T, blocklength, args...)) ParallelStencil.ParallelKernel.falses_metal(::Type{T}, blocklength, args...) where {T<:Bool} = Metal.falses(args...) ParallelStencil.ParallelKernel.trues_metal(::Type{T}, blocklength, args...) where {T<:Bool} = Metal.trues(args...) ParallelStencil.ParallelKernel.fill_metal(::Type{T}, blocklength, args...) where {T<:Union{Number,Enum}} = MtlArray(fill_cpu(T, blocklength, args...)) -ParallelStencil.ParallelKernel.zeros_metal(::Type{T}, blocklength, args...) where {T<:Union{SArray,FieldArray}} = (check_datatype_metal(T); fill_metal(T, blocklength, 0, args...)) +ParallelStencil.ParallelKernel.zeros_metal(::Type{T}, blocklength, args...) where {T<:Union{SArray,FieldArray}} = (check_datatype_metal(T); fill_metal(T, blocklength, 0, args...)) ParallelStencil.ParallelKernel.ones_metal(::Type{T}, blocklength, args...) where {T<:Union{SArray,FieldArray}} = (check_datatype_metal(T); fill_metal(T, blocklength, 1, args...)) -ParallelStencil.ParallelKernel.rand_metal(::Type{T}, ::Val{B}, dims) where {T<:Union{SArray,FieldArray}, B} = (check_datatype_metal(T, Bool, Enum); blocklen = (B == 0) ? 
prod(dims) : B; CellArray{T,length(dims),B, Metal.MtlArray{eltype(T),3}}(Metal.MtlArray(Base.rand(eltype(T), blocklen, prod(size(T)), ceil(Int,prod(dims)/(blocklen))), dims))) +ParallelStencil.ParallelKernel.rand_metal(::Type{T}, ::Val{B}, dims) where {T<:Union{SArray,FieldArray}, B} = (check_datatype_metal(T, Bool, Enum); blocklen = (B == 0) ? prod(dims) : B; CellArray{T,length(dims),B, Metal.MtlArray{eltype(T),3}}(Metal.rand(eltype(T), blocklen, prod(size(T)), ceil(Int,prod(dims)/(blocklen))), dims)) ParallelStencil.ParallelKernel.rand_metal(::Type{T}, blocklength, dims...) where {T<:Union{SArray,FieldArray}} = rand_metal(T, blocklength, dims) -ParallelStencil.ParallelKernel.falses_metal(::Type{T}, blocklength, args...) where {T<:Union{SArray,FieldArray}} = fill_metal(T, blocklength, false, args...) -ParallelStencil.ParallelKernel.trues_metal(::Type{T}, blocklength, args...) where {T<:Union{SArray,FieldArray}} = fill_metal(T, blocklength, true, args...) +ParallelStencil.ParallelKernel.falses_metal(::Type{T}, blocklength, args...) where {T<:Union{SArray,FieldArray}} = fill_metal(T, blocklength, false, args...) +ParallelStencil.ParallelKernel.trues_metal(::Type{T}, blocklength, args...) where {T<:Union{SArray,FieldArray}} = fill_metal(T, blocklength, true, args...) function ParallelStencil.ParallelKernel.fill_metal(::Type{T}, ::Val{B}, x, args...) where {T <: Union{SArray,FieldArray}, B} if (!(eltype(x) <: Number) || (eltype(x) == Bool)) && (eltype(x) != eltype(T)) @ArgumentError("fill: the (element) type of argument 'x' is not a normal number type ($(eltype(x))), but does not match the obtained (default) 'eltype' ($(eltype(T))); automatic conversion to $(eltype(T)) is therefore not attempted. Set the keyword argument 'eltype' accordingly to the element type of 'x' or pass an 'x' of a different (element) type.") end diff --git a/src/ParallelKernel/MetalExt/shared.jl b/src/ParallelKernel/MetalExt/shared.jl index ffcb011f..8387dc37 100644 --- a/src/ParallelKernel/MetalExt/shared.jl +++ b/src/ParallelKernel/MetalExt/shared.jl @@ -4,8 +4,7 @@ using ParallelStencil.ParallelKernel.Exceptions using Metal, CellArrays, StaticArrays import Metal.MTL -## TODO add Metal backend for CellArray -# @define_MetalCellArray +@define_MtlCellArray ## FUNCTIONS TO CHECK EXTENSIONS SUPPORT ParallelStencil.ParallelKernel.is_loaded(::Val{:ParallelStencil_MetalExt}) = true diff --git a/test/ParallelKernel/test_allocators.jl b/test/ParallelKernel/test_allocators.jl index a4abfa4b..6f990b72 100644 --- a/test/ParallelKernel/test_allocators.jl +++ b/test/ParallelKernel/test_allocators.jl @@ -20,7 +20,7 @@ end @static if PKG_METAL in TEST_PACKAGES import Metal if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end - #@define_MetalCellArray + @define_MtlCellArray end @static if PKG_POLYESTER in TEST_PACKAGES import Polyester @@ -147,8 +147,10 @@ eval(:( @test typeof(@ones(2,3, eltype=Float32)) == typeof(Metal.MtlArray(ones(Float32,2,3))) @test typeof(@ones(2,3, eltype=DATA_INDEX)) == typeof(Metal.MtlArray(ones(DATA_INDEX,2,3))) @test typeof(@rand(2,3)) == typeof(Metal.MtlArray(rand(Float16,2,3))) + @test typeof(@rand(2,3, eltype=Float32)) == typeof(Metal.MtlArray(rand(Float32,2,3))) @test typeof(@rand(2,3, eltype=DATA_INDEX)) == typeof(Metal.MtlArray(rand(DATA_INDEX,2,3))) @test typeof(@fill(9, 2,3)) == typeof(Metal.MtlArray(fill(convert(Float16, 9), 2,3))) + @test typeof(@fill(9, 2,3, eltype=Float32)) == typeof(Metal.MtlArray(fill(convert(Float32, 9), 2,3))) @test typeof(@fill(9, 2,3, 
eltype=DATA_INDEX)) == typeof(Metal.MtlArray(fill(convert(DATA_INDEX, 9), 2,3))) else @test typeof(@zeros(2,3)) == typeof(parentmodule($package).zeros(Float16,2,3)) @@ -202,15 +204,19 @@ eval(:( @test @zeros(2,3, celldims=(3,4), eltype=DATA_INDEX) == CellArrays.fill!(ROCCellArray{T_Index}(undef,2,3), T_Index(zeros((3,4)))) AMDGPU.allowscalar(false) #TODO: check how to do elseif $package == $PKG_METAL - # @test @zeros(2,3, celldims=(3,4)) == CellArrays.fill!(MtlCellArray{T_Float16}(undef,2,3), T_Float16(zeros((3,4)))) - # @test @zeros(2,3, celldims=(3,4), eltype=Float32) == CellArrays.fill!(MtlCellArray{T_Float32}(undef,2,3), T_Float32(zeros((3,4)))) - # @test @ones(2,3, celldims=(3,4)) == CellArrays.fill!(MtlCellArray{T_Float16}(undef,2,3), T_Float16(ones((3,4)))) - # @test @ones(2,3, celldims=(3,4), eltype=Float32) == CellArrays.fill!(MtlCellArray{T_Float32}(undef,2,3), T_Float32(ones((3,4)))) - # @test typeof(@rand(2,3, celldims=(3,4))) == typeof(MtlCellArray{T_Float16,0}(undef,2,3)) - # @test typeof(@fill(9, 2,3, celldims=(3,4))) == typeof(MtlCellArray{T_Float16,0}(undef,2,3)) - # @test @falses(2,3, celldims=(3,4)) == CellArrays.fill!(MtlCellArray{T_Bool}(undef,2,3), falses((3,4))) - # @test @trues(2,3, celldims=(3,4)) == CellArrays.fill!(MtlCellArray{T_Bool}(undef,2,3), trues((3,4))) - # @test @zeros(2,3, celldims=(3,4), eltype=DATA_INDEX) == CellArrays.fill!(MtlCellArray{T_Index}(undef,2,3), T_Index(zeros((3,4)))) + Metal.allowscalar(true) + @test @zeros(2,3, celldims=(3,4)) == CellArrays.fill!(MtlCellArray{T_Float16}(undef,2,3), T_Float16(zeros((3,4)))) + @test @zeros(2,3, celldims=(3,4), eltype=Float32) == CellArrays.fill!(MtlCellArray{T_Float32}(undef,2,3), T_Float32(zeros((3,4)))) + @test @ones(2,3, celldims=(3,4)) == CellArrays.fill!(MtlCellArray{T_Float16}(undef,2,3), T_Float16(ones((3,4)))) + @test @ones(2,3, celldims=(3,4), eltype=Float32) == CellArrays.fill!(MtlCellArray{T_Float32}(undef,2,3), T_Float32(ones((3,4)))) + @test typeof(@rand(2,3, celldims=(3,4))) == typeof(MtlCellArray{T_Float16,0}(undef,2,3)) + @test typeof(@rand(2,3, celldims=(3,4), eltype=Float32)) == typeof(MtlCellArray{T_Float32,0}(undef,2,3)) + @test typeof(@fill(9, 2,3, celldims=(3,4))) == typeof(MtlCellArray{T_Float16,0}(undef,2,3)) + @test typeof(@fill(9, 2,3, celldims=(3,4), eltype=Float32)) == typeof(MtlCellArray{T_Float32,0}(undef,2,3)) + @test @falses(2,3, celldims=(3,4)) == CellArrays.fill!(MtlCellArray{T_Bool}(undef,2,3), falses((3,4))) + @test @trues(2,3, celldims=(3,4)) == CellArrays.fill!(MtlCellArray{T_Bool}(undef,2,3), trues((3,4))) + @test @zeros(2,3, celldims=(3,4), eltype=DATA_INDEX) == CellArrays.fill!(MtlCellArray{T_Index}(undef,2,3), T_Index(zeros((3,4)))) + Metal.allowscalar(false) else @test @zeros(2,3, celldims=(3,4)) == CellArrays.fill!(CPUCellArray{T_Float16}(undef,2,3), T_Float16(zeros((3,4)))) @test @zeros(2,3, celldims=(3,4), eltype=Float32) == CellArrays.fill!(CPUCellArray{T_Float32}(undef,2,3), T_Float32(zeros((3,4)))) @@ -251,14 +257,17 @@ eval(:( @test @zeros(2,3, celltype=SymmetricTensor2D_Index) == CellArrays.fill!(ROCCellArray{SymmetricTensor2D_Index}(undef,2,3), SymmetricTensor2D_Index(zeros(3))) AMDGPU.allowscalar(false) elseif $package == $PKG_METAL - # @test @zeros(2,3, celltype=SymmetricTensor2D) == CellArrays.fill!(MtlCellArray{SymmetricTensor2D}(undef,2,3), SymmetricTensor2D(zeros(3))) - # @test @zeros(2,3, celltype=SymmetricTensor3D) == CellArrays.fill!(MtlCellArray{SymmetricTensor3D}(undef,2,3), SymmetricTensor3D(zeros(6))) - # @test @zeros(2,3, celltype=Tensor2D) 
== CellArrays.fill!(MtlCellArray{Tensor2D}(undef,2,3), Tensor2D(zeros((2,2,2,2)))) - # @test @zeros(2,3, celltype=SymmetricTensor2D_Float32) == CellArrays.fill!(MtlCellArray{SymmetricTensor2D_Float32}(undef,2,3), SymmetricTensor2D_Float32(zeros(3))) - # @test @ones(2,3, celltype=SymmetricTensor2D) == CellArrays.fill!(MtlCellArray{SymmetricTensor2D}(undef,2,3), SymmetricTensor2D(ones(3))) - # @test typeof(@rand(2,3, celltype=SymmetricTensor2D)) == typeof(MtlCellArray{SymmetricTensor2D,0}(undef,2,3)) - # @test typeof(@fill(9, 2,3, celltype=SymmetricTensor2D)) == typeof(MtlCellArray{SymmetricTensor2D,0}(undef,2,3)) - # @test @zeros(2,3, celltype=SymmetricTensor2D_Index) == CellArrays.fill!(MtlCellArray{SymmetricTensor2D_Index}(undef,2,3), SymmetricTensor2D_Index(zeros(3))) + Metal.allowscalar(true) + @test @zeros(2,3, celltype=SymmetricTensor2D) == CellArrays.fill!(MtlCellArray{SymmetricTensor2D}(undef,2,3), SymmetricTensor2D(zeros(3))) + @test @zeros(2,3, celltype=SymmetricTensor3D) == CellArrays.fill!(MtlCellArray{SymmetricTensor3D}(undef,2,3), SymmetricTensor3D(zeros(6))) + @test @zeros(2,3, celltype=Tensor2D) == CellArrays.fill!(MtlCellArray{Tensor2D}(undef,2,3), Tensor2D(zeros((2,2,2,2)))) + @test @zeros(2,3, celltype=SymmetricTensor2D_T{Float32}) == CellArrays.fill!(MtlCellArray{SymmetricTensor2D_T{Float32}}(undef,2,3), SymmetricTensor2D_T{Float64}(zeros(3))) + @test @zeros(2,3, celltype=SymmetricTensor2D_Float32) == CellArrays.fill!(MtlCellArray{SymmetricTensor2D_Float32}(undef,2,3), SymmetricTensor2D_Float32(zeros(3))) + @test @ones(2,3, celltype=SymmetricTensor2D) == CellArrays.fill!(MtlCellArray{SymmetricTensor2D}(undef,2,3), SymmetricTensor2D(ones(3))) + @test typeof(@rand(2,3, celltype=SymmetricTensor2D)) == typeof(MtlCellArray{SymmetricTensor2D,0}(undef,2,3)) + @test typeof(@fill(9, 2,3, celltype=SymmetricTensor2D)) == typeof(MtlCellArray{SymmetricTensor2D,0}(undef,2,3)) + @test @zeros(2,3, celltype=SymmetricTensor2D_Index) == CellArrays.fill!(MtlCellArray{SymmetricTensor2D_Index}(undef,2,3), SymmetricTensor2D_Index(zeros(3))) + Metal.allowscalar(false) else @test @zeros(2,3, celltype=SymmetricTensor2D) == CellArrays.fill!(CPUCellArray{SymmetricTensor2D}(undef,2,3), SymmetricTensor2D(zeros(3))) @test @zeros(2,3, celltype=SymmetricTensor3D) == CellArrays.fill!(CPUCellArray{SymmetricTensor3D}(undef,2,3), SymmetricTensor3D(zeros(6))) @@ -308,6 +317,8 @@ eval(:( elseif $package == $PKG_METAL @test typeof(@zeros(2,3, eltype=Float32)) == typeof(Metal.MtlArray(zeros(Float32,2,3))) @test typeof(@ones(2,3, eltype=Float32)) == typeof(Metal.MtlArray(ones(Float32,2,3))) + @test typeof(@rand(2,3, eltype=Float32)) == typeof(Metal.MtlArray(rand(Float32,2,3))) + @test typeof(@fill(9, 2,3, eltype=Float32)) == typeof(Metal.MtlArray(fill(convert(Float32, 9), 2,3))) @test typeof(@zeros(2,3, eltype=DATA_INDEX)) == typeof(Metal.MtlArray(zeros(DATA_INDEX,2,3))) else @test typeof(@zeros(2,3, eltype=Float32)) == typeof(zeros(Float32,2,3)) @@ -343,10 +354,14 @@ eval(:( @test @trues(2,3, celldims=(3,4)) == CellArrays.fill!(ROCCellArray{T_Bool}(undef,2,3), trues((3,4))) AMDGPU.allowscalar(false) elseif $package == $PKG_METAL - # @test @zeros(2,3, celldims=(3,4), eltype=Float32) == CellArrays.fill!(MetalCellArray{T_Float32}(undef,2,3), T_Float32(zeros((3,4)))) - # @test @ones(2,3, celldims=(3,4), eltype=Float32) == CellArrays.fill!(MetalCellArray{T_Float32}(undef,2,3), T_Float32(ones((3,4)))) - # @test @falses(2,3, celldims=(3,4)) == CellArrays.fill!(MetalCellArray{T_Bool}(undef,2,3), falses((3,4))) - # 
@test @trues(2,3, celldims=(3,4)) == CellArrays.fill!(MetalCellArray{T_Bool}(undef,2,3), trues((3,4))) + Metal.allowscalar(true) + @test @zeros(2,3, celldims=(3,4), eltype=Float32) == CellArrays.fill!(MtlCellArray{T_Float32}(undef,2,3), T_Float32(zeros((3,4)))) + @test @ones(2,3, celldims=(3,4), eltype=Float32) == CellArrays.fill!(MtlCellArray{T_Float32}(undef,2,3), T_Float32(ones((3,4)))) + @test typeof(@rand(2,3, celldims=(3,4), eltype=Float32)) == typeof(MtlCellArray{T_Float32,0}(undef,2,3)) + @test typeof(@fill(9, 2,3, celldims=(3,4), eltype=Float32)) == typeof(MtlCellArray{T_Float32,0}(undef,2,3)) + @test @falses(2,3, celldims=(3,4)) == CellArrays.fill!(MtlCellArray{T_Bool}(undef,2,3), falses((3,4))) + @test @trues(2,3, celldims=(3,4)) == CellArrays.fill!(MtlCellArray{T_Bool}(undef,2,3), trues((3,4))) + Metal.allowscalar(false) else @test @zeros(2,3, celldims=(3,4), eltype=Float32) == CellArrays.fill!(CPUCellArray{T_Float32}(undef,2,3), T_Float32(zeros((3,4)))) @test @ones(2,3, celldims=(3,4), eltype=Float32) == CellArrays.fill!(CPUCellArray{T_Float32}(undef,2,3), T_Float32(ones((3,4)))) @@ -380,13 +395,16 @@ eval(:( @test typeof(@fill(9, 2,3, celltype=SymmetricTensor2D)) == typeof(ROCCellArray{SymmetricTensor2D,0}(undef,2,3)) AMDGPU.allowscalar(false) elseif $package == $PKG_METAL - # @test @zeros(2,3, celltype=SymmetricTensor2D) == CellArrays.fill!(MetalCellArray{SymmetricTensor2D}(undef,2,3), SymmetricTensor2D(zeros(3))) - # @test @zeros(2,3, celltype=SymmetricTensor3D) == CellArrays.fill!(MetalCellArray{SymmetricTensor3D}(undef,2,3), SymmetricTensor3D(zeros(6))) - # @test @zeros(2,3, celltype=Tensor2D) == CellArrays.fill!(MetalCellArray{Tensor2D}(undef,2,3), Tensor2D(zeros((2,2,2,2)))) - # @test @zeros(2,3, celltype=SymmetricTensor2D_Float32) == CellArrays.fill!(MetalCellArray{SymmetricTensor2D_Float32}(undef,2,3), SymmetricTensor2D_Float32(zeros(3))) - # @test @ones(2,3, celltype=SymmetricTensor2D) == CellArrays.fill!(MetalCellArray{SymmetricTensor2D}(undef,2,3), SymmetricTensor2D(ones(3))) - # @test typeof(@rand(2,3, celltype=SymmetricTensor2D)) == typeof(MetalCellArray{SymmetricTensor2D,0}(undef,2,3)) - # @test typeof(@fill(9, 2,3, celltype=SymmetricTensor2D)) == typeof(MetalCellArray{SymmetricTensor2D,0}(undef,2,3)) + Metal.allowscalar(true) + @test @zeros(2,3, celltype=SymmetricTensor2D) == CellArrays.fill!(MtlCellArray{SymmetricTensor2D}(undef,2,3), SymmetricTensor2D(zeros(3))) + @test @zeros(2,3, celltype=SymmetricTensor3D) == CellArrays.fill!(MtlCellArray{SymmetricTensor3D}(undef,2,3), SymmetricTensor3D(zeros(6))) + @test @zeros(2,3, celltype=Tensor2D) == CellArrays.fill!(MtlCellArray{Tensor2D}(undef,2,3), Tensor2D(zeros((2,2,2,2)))) + @test @zeros(2,3, celltype=SymmetricTensor2D_T{Float32}) == CellArrays.fill!(MtlCellArray{SymmetricTensor2D_T{Float32}}(undef,2,3), SymmetricTensor2D_T{Float32}(zeros(3))) + @test @zeros(2,3, celltype=SymmetricTensor2D_Float32) == CellArrays.fill!(MtlCellArray{SymmetricTensor2D_Float32}(undef,2,3), SymmetricTensor2D_Float32(zeros(3))) + @test @ones(2,3, celltype=SymmetricTensor2D) == CellArrays.fill!(MtlCellArray{SymmetricTensor2D}(undef,2,3), SymmetricTensor2D(ones(3))) + @test typeof(@rand(2,3, celltype=SymmetricTensor2D)) == typeof(MtlCellArray{SymmetricTensor2D,0}(undef,2,3)) + @test typeof(@fill(9, 2,3, celltype=SymmetricTensor2D)) == typeof(MtlCellArray{SymmetricTensor2D,0}(undef,2,3)) + Metal.allowscalar(false) else @test @zeros(2,3, celltype=SymmetricTensor2D) == CellArrays.fill!(CPUCellArray{SymmetricTensor2D}(undef,2,3), 
SymmetricTensor2D(zeros(3))) @test @zeros(2,3, celltype=SymmetricTensor3D) == CellArrays.fill!(CPUCellArray{SymmetricTensor3D}(undef,2,3), SymmetricTensor3D(zeros(6))) @@ -426,12 +444,14 @@ eval(:( @test typeof( @trues(2,3, celldims=(3,4))) == typeof(ROCCellArray{T_Bool, 0}(undef,2,3)) AMDGPU.allowscalar(false) elseif $package == $PKG_METAL - # @test typeof( @zeros(2,3, celldims=(3,4))) == typeof(MetalCellArray{T_Float16,0}(undef,2,3)) - # @test typeof( @ones(2,3, celldims=(3,4))) == typeof(MetalCellArray{T_Float16,0}(undef,2,3)) - # @test typeof( @rand(2,3, celldims=(3,4))) == typeof(MetalCellArray{T_Float16,0}(undef,2,3)) - # @test typeof(@fill(9, 2,3, celldims=(3,4))) == typeof(MetalCellArray{T_Float16,0}(undef,2,3)) - # @test typeof( @falses(2,3, celldims=(3,4))) == typeof(MetalCellArray{T_Bool, 0}(undef,2,3)) - # @test typeof( @trues(2,3, celldims=(3,4))) == typeof(MetalCellArray{T_Bool, 0}(undef,2,3)) + Metal.allowscalar(true) + @test typeof( @zeros(2,3, celldims=(3,4))) == typeof(MtlCellArray{T_Float16,0}(undef,2,3)) + @test typeof( @ones(2,3, celldims=(3,4))) == typeof(MtlCellArray{T_Float16,0}(undef,2,3)) + @test typeof( @rand(2,3, celldims=(3,4))) == typeof(MtlCellArray{T_Float16,0}(undef,2,3)) + @test typeof(@fill(9, 2,3, celldims=(3,4))) == typeof(MtlCellArray{T_Float16,0}(undef,2,3)) + @test typeof( @falses(2,3, celldims=(3,4))) == typeof(MtlCellArray{T_Bool, 0}(undef,2,3)) + @test typeof( @trues(2,3, celldims=(3,4))) == typeof(MtlCellArray{T_Bool, 0}(undef,2,3)) + Metal.allowscalar(false) else @test typeof( @zeros(2,3, celldims=(3,4))) == typeof(CPUCellArray{T_Float16,1}(undef,2,3)) @test typeof( @ones(2,3, celldims=(3,4))) == typeof(CPUCellArray{T_Float16,1}(undef,2,3)) @@ -473,18 +493,20 @@ eval(:( @test typeof( @trues(2,3, celldims=(3,4), blocklength=3)) == typeof(ROCCellArray{T_Bool, 3}(undef,2,3)) AMDGPU.allowscalar(false) elseif $package == $PKG_METAL - # @test typeof( @zeros(2,3, celldims=(3,4), blocklength=1)) == typeof(MetalCellArray{T_Float16,1}(undef,2,3)) - # @test typeof( @ones(2,3, celldims=(3,4), blocklength=1)) == typeof(MetalCellArray{T_Float16,1}(undef,2,3)) - # @test typeof( @rand(2,3, celldims=(3,4), blocklength=1)) == typeof(MetalCellArray{T_Float16,1}(undef,2,3)) - # @test typeof(@fill(9, 2,3, celldims=(3,4), blocklength=1)) == typeof(MetalCellArray{T_Float16,1}(undef,2,3)) - # @test typeof( @falses(2,3, celldims=(3,4), blocklength=1)) == typeof(MetalCellArray{T_Bool, 1}(undef,2,3)) - # @test typeof( @trues(2,3, celldims=(3,4), blocklength=1)) == typeof(MetalCellArray{T_Bool, 1}(undef,2,3)) - # @test typeof( @zeros(2,3, celldims=(3,4), blocklength=3)) == typeof(MetalCellArray{T_Float16,3}(undef,2,3)) - # @test typeof( @ones(2,3, celldims=(3,4), blocklength=3)) == typeof(MetalCellArray{T_Float16,3}(undef,2,3)) - # @test typeof( @rand(2,3, celldims=(3,4), blocklength=3)) == typeof(MetalCellArray{T_Float16,3}(undef,2,3)) - # @test typeof(@fill(9, 2,3, celldims=(3,4), blocklength=3)) == typeof(MetalCellArray{T_Float16,3}(undef,2,3)) - # @test typeof( @falses(2,3, celldims=(3,4), blocklength=3)) == typeof(MetalCellArray{T_Bool, 3}(undef,2,3)) - # @test typeof( @trues(2,3, celldims=(3,4), blocklength=3)) == typeof(MetalCellArray{T_Bool, 3}(undef,2,3)) + Metal.allowscalar(true) + @test typeof( @zeros(2,3, celldims=(3,4), blocklength=1)) == typeof(MtlCellArray{T_Float16,1}(undef,2,3)) + @test typeof( @ones(2,3, celldims=(3,4), blocklength=1)) == typeof(MtlCellArray{T_Float16,1}(undef,2,3)) + @test typeof( @rand(2,3, celldims=(3,4), blocklength=1)) == 
typeof(MtlCellArray{T_Float16,1}(undef,2,3)) + @test typeof(@fill(9, 2,3, celldims=(3,4), blocklength=1)) == typeof(MtlCellArray{T_Float16,1}(undef,2,3)) + @test typeof( @falses(2,3, celldims=(3,4), blocklength=1)) == typeof(MtlCellArray{T_Bool, 1}(undef,2,3)) + @test typeof( @trues(2,3, celldims=(3,4), blocklength=1)) == typeof(MtlCellArray{T_Bool, 1}(undef,2,3)) + @test typeof( @zeros(2,3, celldims=(3,4), blocklength=3)) == typeof(MtlCellArray{T_Float16,3}(undef,2,3)) + @test typeof( @ones(2,3, celldims=(3,4), blocklength=3)) == typeof(MtlCellArray{T_Float16,3}(undef,2,3)) + @test typeof( @rand(2,3, celldims=(3,4), blocklength=3)) == typeof(MtlCellArray{T_Float16,3}(undef,2,3)) + @test typeof(@fill(9, 2,3, celldims=(3,4), blocklength=3)) == typeof(MtlCellArray{T_Float16,3}(undef,2,3)) + @test typeof( @falses(2,3, celldims=(3,4), blocklength=3)) == typeof(MtlCellArray{T_Bool, 3}(undef,2,3)) + @test typeof( @trues(2,3, celldims=(3,4), blocklength=3)) == typeof(MtlCellArray{T_Bool, 3}(undef,2,3)) + Metal.allowscalar(false) else @test typeof( @zeros(2,3, celldims=(3,4), blocklength=0)) == typeof(CPUCellArray{T_Float16,0}(undef,2,3)) @test typeof( @ones(2,3, celldims=(3,4), blocklength=0)) == typeof(CPUCellArray{T_Float16,0}(undef,2,3)) @@ -525,11 +547,13 @@ eval(:( @test typeof(@fill(@rand(3,4,eltype=Phase), 2,3, celldims=(3,4), eltype=Phase)) == typeof(ROCCellArray{T_Phase,0}(undef,2,3)) AMDGPU.allowscalar(false) elseif $package == $PKG_METAL - # @test typeof(@rand(2,3, eltype=Phase)) == typeof(Metal.MtlArray(rand(Phase, 2,3))) - # @test typeof(@rand(2,3, celldims=(3,4), eltype=Phase)) == typeof(MetalCellArray{T_Phase,0}(undef,2,3)) - # @test typeof(@fill(solid, 2,3, eltype=Phase)) == typeof(Metal.MtlArray(rand(Phase, 2,3))) - # @test typeof(@fill(solid, 2,3, celldims=(3,4), eltype=Phase)) == typeof(MetalCellArray{T_Phase,0}(undef,2,3)) - # @test typeof(@fill(@rand(3,4,eltype=Phase), 2,3, celldims=(3,4), eltype=Phase)) == typeof(MetalCellArray{T_Phase,0}(undef,2,3)) + Metal.allowscalar(true) + @test typeof(@rand(2,3, eltype=Phase)) == typeof(Metal.MtlArray(rand(Phase, 2,3))) + # @test typeof(@rand(2,3, celldims=(3,4), eltype=Phase)) == typeof(MtlCellArray{T_Phase,0}(undef,2,3)) # TODO fails because of bug in Metal.jl RNG implementation + @test typeof(@fill(solid, 2,3, eltype=Phase)) == typeof(Metal.MtlArray(rand(Phase, 2,3))) + @test typeof(@fill(solid, 2,3, celldims=(3,4), eltype=Phase)) == typeof(MtlCellArray{T_Phase,0}(undef,2,3)) + @test typeof(@fill(@rand(3,4,eltype=Phase), 2,3, celldims=(3,4), eltype=Phase)) == typeof(MtlCellArray{T_Phase,0}(undef,2,3)) + Metal.allowscalar(false) else @test typeof(@rand(2,3, eltype=Phase)) == typeof(rand(Phase, 2,3)) @test typeof(@rand(2,3, celldims=(3,4), eltype=Phase)) == typeof(CPUCellArray{T_Phase,1}(undef,2,3)) From 477f25c4a6a2280675076976333b5f25ac34b6be Mon Sep 17 00:00:00 2001 From: GiackAloZ Date: Thu, 10 Oct 2024 11:11:05 +0200 Subject: [PATCH 26/46] Check Sys if apple before importing Metal in tests --- Project.toml | 2 +- test/ParallelKernel/test_allocators.jl | 2 +- test/ParallelKernel/test_hide_communication.jl | 2 +- test/ParallelKernel/test_init_parallel_kernel.jl | 2 +- test/ParallelKernel/test_kernel_language.jl | 2 +- test/ParallelKernel/test_parallel.jl | 2 +- test/ParallelKernel/test_reset_parallel_kernel.jl | 2 +- test/test_FiniteDifferences1D.jl | 2 +- test/test_FiniteDifferences2D.jl | 2 +- test/test_FiniteDifferences3D.jl | 2 +- test/test_extensions.jl | 2 +- test/test_incremental_compilation.jl | 2 +- 
test/test_init_parallel_stencil.jl | 2 +- test/test_parallel.jl | 2 +- test/test_reset_parallel_stencil.jl | 2 +- 15 files changed, 15 insertions(+), 15 deletions(-) diff --git a/Project.toml b/Project.toml index e7036d24..285c2828 100644 --- a/Project.toml +++ b/Project.toml @@ -28,7 +28,7 @@ CUDA = "3.12, 4, 5" CellArrays = "0.2.1" Enzyme = "0.11" MacroTools = "0.5" -Metal = "1.0" +Metal = "1" Polyester = "0.7" StaticArrays = "1" julia = "1.9" # Minimum version supporting extensions diff --git a/test/ParallelKernel/test_allocators.jl b/test/ParallelKernel/test_allocators.jl index 6f990b72..21fb40fe 100644 --- a/test/ParallelKernel/test_allocators.jl +++ b/test/ParallelKernel/test_allocators.jl @@ -17,7 +17,7 @@ end if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end @define_ROCCellArray end -@static if PKG_METAL in TEST_PACKAGES +@static if Sys.isapple() && PKG_METAL in TEST_PACKAGES import Metal if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end @define_MtlCellArray diff --git a/test/ParallelKernel/test_hide_communication.jl b/test/ParallelKernel/test_hide_communication.jl index 6c7c7704..c43d93a0 100644 --- a/test/ParallelKernel/test_hide_communication.jl +++ b/test/ParallelKernel/test_hide_communication.jl @@ -14,7 +14,7 @@ end import AMDGPU if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end -@static if PKG_METAL in TEST_PACKAGES +@static if Sys.isapple() && PKG_METAL in TEST_PACKAGES import Metal if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end end diff --git a/test/ParallelKernel/test_init_parallel_kernel.jl b/test/ParallelKernel/test_init_parallel_kernel.jl index fe4ab4b5..faf75887 100644 --- a/test/ParallelKernel/test_init_parallel_kernel.jl +++ b/test/ParallelKernel/test_init_parallel_kernel.jl @@ -14,7 +14,7 @@ end import AMDGPU if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end -@static if PKG_METAL in TEST_PACKAGES +@static if Sys.isapple() && PKG_METAL in TEST_PACKAGES import Metal if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end end diff --git a/test/ParallelKernel/test_kernel_language.jl b/test/ParallelKernel/test_kernel_language.jl index 8cc48b37..17aa4262 100644 --- a/test/ParallelKernel/test_kernel_language.jl +++ b/test/ParallelKernel/test_kernel_language.jl @@ -14,7 +14,7 @@ end import AMDGPU if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end -@static if PKG_METAL in TEST_PACKAGES +@static if Sys.isapple() && PKG_METAL in TEST_PACKAGES import Metal if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end end diff --git a/test/ParallelKernel/test_parallel.jl b/test/ParallelKernel/test_parallel.jl index 8a4d4538..dcab8970 100644 --- a/test/ParallelKernel/test_parallel.jl +++ b/test/ParallelKernel/test_parallel.jl @@ -16,7 +16,7 @@ end import AMDGPU if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end -@static if PKG_METAL in TEST_PACKAGES +@static if Sys.isapple() && PKG_METAL in TEST_PACKAGES import Metal if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end end diff --git a/test/ParallelKernel/test_reset_parallel_kernel.jl b/test/ParallelKernel/test_reset_parallel_kernel.jl index 593a5e21..06938c20 100644 --- a/test/ParallelKernel/test_reset_parallel_kernel.jl +++ b/test/ParallelKernel/test_reset_parallel_kernel.jl @@ -12,7 +12,7 @@ end import 
AMDGPU if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end -@static if PKG_METAL in TEST_PACKAGES +@static if Sys.isapple() && PKG_METAL in TEST_PACKAGES import Metal if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end end diff --git a/test/test_FiniteDifferences1D.jl b/test/test_FiniteDifferences1D.jl index 97753b2f..b610d620 100644 --- a/test/test_FiniteDifferences1D.jl +++ b/test/test_FiniteDifferences1D.jl @@ -12,7 +12,7 @@ end import AMDGPU if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end -@static if PKG_METAL in TEST_PACKAGES +@static if Sys.isapple() && PKG_METAL in TEST_PACKAGES import Metal if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end end diff --git a/test/test_FiniteDifferences2D.jl b/test/test_FiniteDifferences2D.jl index 73dd0aea..6f853d6f 100644 --- a/test/test_FiniteDifferences2D.jl +++ b/test/test_FiniteDifferences2D.jl @@ -12,7 +12,7 @@ end import AMDGPU if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end -@static if PKG_METAL in TEST_PACKAGES +@static if Sys.isapple() && PKG_METAL in TEST_PACKAGES import Metal if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end end diff --git a/test/test_FiniteDifferences3D.jl b/test/test_FiniteDifferences3D.jl index 844062f7..7a23c019 100644 --- a/test/test_FiniteDifferences3D.jl +++ b/test/test_FiniteDifferences3D.jl @@ -12,7 +12,7 @@ end import AMDGPU if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end -@static if PKG_METAL in TEST_PACKAGES +@static if Sys.isapple() && PKG_METAL in TEST_PACKAGES import Metal if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end end diff --git a/test/test_extensions.jl b/test/test_extensions.jl index b9a47ec9..75e54466 100644 --- a/test/test_extensions.jl +++ b/test/test_extensions.jl @@ -10,7 +10,7 @@ end import AMDGPU if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end -@static if PKG_METAL in TEST_PACKAGES +@static if Sys.isapple() && PKG_METAL in TEST_PACKAGES import Metal if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end end diff --git a/test/test_incremental_compilation.jl b/test/test_incremental_compilation.jl index 0a82ddf0..f0b49a9a 100644 --- a/test/test_incremental_compilation.jl +++ b/test/test_incremental_compilation.jl @@ -9,7 +9,7 @@ end import AMDGPU if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end -@static if PKG_METAL in TEST_PACKAGES +@static if Sys.isapple() && PKG_METAL in TEST_PACKAGES import Metal if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end end diff --git a/test/test_init_parallel_stencil.jl b/test/test_init_parallel_stencil.jl index 6f8e168d..6c9559d4 100644 --- a/test/test_init_parallel_stencil.jl +++ b/test/test_init_parallel_stencil.jl @@ -13,7 +13,7 @@ end import AMDGPU if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end -@static if PKG_METAL in TEST_PACKAGES +@static if Sys.isapple() && PKG_METAL in TEST_PACKAGES import Metal if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end end diff --git a/test/test_parallel.jl b/test/test_parallel.jl index dd434d26..0b021b51 100644 --- a/test/test_parallel.jl +++ b/test/test_parallel.jl @@ -15,7 +15,7 @@ end import AMDGPU if !AMDGPU.functional() TEST_PACKAGES = 
filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end -@static if PKG_METAL in TEST_PACKAGES +@static if Sys.isapple() && PKG_METAL in TEST_PACKAGES import Metal if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end end diff --git a/test/test_reset_parallel_stencil.jl b/test/test_reset_parallel_stencil.jl index d160537e..a5be1bdf 100644 --- a/test/test_reset_parallel_stencil.jl +++ b/test/test_reset_parallel_stencil.jl @@ -11,7 +11,7 @@ end import AMDGPU if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end -@static if PKG_METAL in TEST_PACKAGES +@static if Sys.isapple() && PKG_METAL in TEST_PACKAGES import Metal if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end end From e440c853a5ff5f8812444c98d4660ccaef60e18d Mon Sep 17 00:00:00 2001 From: GiackAloZ Date: Thu, 10 Oct 2024 11:25:13 +0200 Subject: [PATCH 27/46] Fix compat for Metal to 1.2 or higher (restricted to v1) --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 322c8068..6432f766 100644 --- a/Project.toml +++ b/Project.toml @@ -28,7 +28,7 @@ CUDA = "3.12, 4, 5" CellArrays = "0.2.1" Enzyme = "0.11, 0.12, 0.13" MacroTools = "0.5" -Metal = "1" +Metal = "^1.2" Polyester = "0.7" StaticArrays = "1" julia = "1.9" # Minimum version supporting extensions From 176387d59abc23c33b5b95adbd9a2b9b2c922ce6 Mon Sep 17 00:00:00 2001 From: GiackAloZ Date: Thu, 10 Oct 2024 11:33:18 +0200 Subject: [PATCH 28/46] Put more constraints with Sys.isapple --- src/ParallelKernel/MetalExt/shared.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ParallelKernel/MetalExt/shared.jl b/src/ParallelKernel/MetalExt/shared.jl index 8387dc37..8dcfc604 100644 --- a/src/ParallelKernel/MetalExt/shared.jl +++ b/src/ParallelKernel/MetalExt/shared.jl @@ -4,7 +4,7 @@ using ParallelStencil.ParallelKernel.Exceptions using Metal, CellArrays, StaticArrays import Metal.MTL -@define_MtlCellArray +@static if Sys.isapple() @define_MtlCellArray end ## FUNCTIONS TO CHECK EXTENSIONS SUPPORT ParallelStencil.ParallelKernel.is_loaded(::Val{:ParallelStencil_MetalExt}) = true From 82a358dab7a482811d8fb41e2b15bfb18b627e23 Mon Sep 17 00:00:00 2001 From: GiackAloZ Date: Thu, 10 Oct 2024 11:34:36 +0200 Subject: [PATCH 29/46] Rollback --- src/ParallelKernel/MetalExt/shared.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ParallelKernel/MetalExt/shared.jl b/src/ParallelKernel/MetalExt/shared.jl index 8dcfc604..8387dc37 100644 --- a/src/ParallelKernel/MetalExt/shared.jl +++ b/src/ParallelKernel/MetalExt/shared.jl @@ -4,7 +4,7 @@ using ParallelStencil.ParallelKernel.Exceptions using Metal, CellArrays, StaticArrays import Metal.MTL -@static if Sys.isapple() @define_MtlCellArray end +@define_MtlCellArray ## FUNCTIONS TO CHECK EXTENSIONS SUPPORT ParallelStencil.ParallelKernel.is_loaded(::Val{:ParallelStencil_MetalExt}) = true From 1efcb4e13450222b1dcb37098dc25a36bb6acb20 Mon Sep 17 00:00:00 2001 From: GiackAloZ Date: Wed, 23 Oct 2024 17:01:35 +0200 Subject: [PATCH 30/46] Update CellArrays version --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 6432f766..00111086 100644 --- a/Project.toml +++ b/Project.toml @@ -25,7 +25,7 @@ ParallelStencil_MetalExt = "Metal" [compat] AMDGPU = "0.6, 0.7, 0.8, 0.9, 1" CUDA = "3.12, 4, 5" -CellArrays = "0.2.1" +CellArrays = "0.3.0" Enzyme = "0.11, 0.12, 0.13" MacroTools = "0.5" Metal = "^1.2" From 
533cd10a9e75c0cdb0c6e859076abbd4bb0aa36d Mon Sep 17 00:00:00 2001 From: GiackAloZ Date: Mon, 28 Oct 2024 16:22:25 +0100 Subject: [PATCH 31/46] Fix test for Metal --- test/ParallelKernel/test_allocators.jl | 2 +- test/ParallelKernel/test_hide_communication.jl | 2 +- test/ParallelKernel/test_init_parallel_kernel.jl | 2 +- test/ParallelKernel/test_kernel_language.jl | 2 +- test/ParallelKernel/test_parallel.jl | 2 +- test/ParallelKernel/test_reset_parallel_kernel.jl | 2 +- test/test_FiniteDifferences1D.jl | 2 +- test/test_FiniteDifferences2D.jl | 2 +- test/test_FiniteDifferences3D.jl | 2 +- test/test_extensions.jl | 2 +- test/test_incremental_compilation.jl | 2 +- test/test_init_parallel_stencil.jl | 2 +- test/test_parallel.jl | 2 +- test/test_reset_parallel_stencil.jl | 2 +- 14 files changed, 14 insertions(+), 14 deletions(-) diff --git a/test/ParallelKernel/test_allocators.jl b/test/ParallelKernel/test_allocators.jl index c21c68a4..b58d3212 100644 --- a/test/ParallelKernel/test_allocators.jl +++ b/test/ParallelKernel/test_allocators.jl @@ -19,7 +19,7 @@ end if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end @define_ROCCellArray end -@static if Sys.isapple() && PKG_METAL in TEST_PACKAGES +@static if PKG_METAL in TEST_PACKAGES import Metal if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end @define_MtlCellArray diff --git a/test/ParallelKernel/test_hide_communication.jl b/test/ParallelKernel/test_hide_communication.jl index c43d93a0..6c7c7704 100644 --- a/test/ParallelKernel/test_hide_communication.jl +++ b/test/ParallelKernel/test_hide_communication.jl @@ -14,7 +14,7 @@ end import AMDGPU if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end -@static if Sys.isapple() && PKG_METAL in TEST_PACKAGES +@static if PKG_METAL in TEST_PACKAGES import Metal if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end end diff --git a/test/ParallelKernel/test_init_parallel_kernel.jl b/test/ParallelKernel/test_init_parallel_kernel.jl index e2e319c1..c200362c 100644 --- a/test/ParallelKernel/test_init_parallel_kernel.jl +++ b/test/ParallelKernel/test_init_parallel_kernel.jl @@ -14,7 +14,7 @@ end import AMDGPU if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end -@static if Sys.isapple() && PKG_METAL in TEST_PACKAGES +@static if PKG_METAL in TEST_PACKAGES import Metal if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end end diff --git a/test/ParallelKernel/test_kernel_language.jl b/test/ParallelKernel/test_kernel_language.jl index 17aa4262..8cc48b37 100644 --- a/test/ParallelKernel/test_kernel_language.jl +++ b/test/ParallelKernel/test_kernel_language.jl @@ -14,7 +14,7 @@ end import AMDGPU if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end -@static if Sys.isapple() && PKG_METAL in TEST_PACKAGES +@static if PKG_METAL in TEST_PACKAGES import Metal if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end end diff --git a/test/ParallelKernel/test_parallel.jl b/test/ParallelKernel/test_parallel.jl index 6593efc1..c2ab5856 100644 --- a/test/ParallelKernel/test_parallel.jl +++ b/test/ParallelKernel/test_parallel.jl @@ -16,7 +16,7 @@ end import AMDGPU if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end -@static if Sys.isapple() && PKG_METAL in TEST_PACKAGES +@static if PKG_METAL in TEST_PACKAGES import Metal if !Metal.functional() 
TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end end diff --git a/test/ParallelKernel/test_reset_parallel_kernel.jl b/test/ParallelKernel/test_reset_parallel_kernel.jl index ce1fc4f5..1f404c04 100644 --- a/test/ParallelKernel/test_reset_parallel_kernel.jl +++ b/test/ParallelKernel/test_reset_parallel_kernel.jl @@ -12,7 +12,7 @@ end import AMDGPU if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end -@static if Sys.isapple() && PKG_METAL in TEST_PACKAGES +@static if PKG_METAL in TEST_PACKAGES import Metal if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end end diff --git a/test/test_FiniteDifferences1D.jl b/test/test_FiniteDifferences1D.jl index b610d620..97753b2f 100644 --- a/test/test_FiniteDifferences1D.jl +++ b/test/test_FiniteDifferences1D.jl @@ -12,7 +12,7 @@ end import AMDGPU if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end -@static if Sys.isapple() && PKG_METAL in TEST_PACKAGES +@static if PKG_METAL in TEST_PACKAGES import Metal if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end end diff --git a/test/test_FiniteDifferences2D.jl b/test/test_FiniteDifferences2D.jl index 6f853d6f..73dd0aea 100644 --- a/test/test_FiniteDifferences2D.jl +++ b/test/test_FiniteDifferences2D.jl @@ -12,7 +12,7 @@ end import AMDGPU if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end -@static if Sys.isapple() && PKG_METAL in TEST_PACKAGES +@static if PKG_METAL in TEST_PACKAGES import Metal if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end end diff --git a/test/test_FiniteDifferences3D.jl b/test/test_FiniteDifferences3D.jl index 7a23c019..844062f7 100644 --- a/test/test_FiniteDifferences3D.jl +++ b/test/test_FiniteDifferences3D.jl @@ -12,7 +12,7 @@ end import AMDGPU if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end -@static if Sys.isapple() && PKG_METAL in TEST_PACKAGES +@static if PKG_METAL in TEST_PACKAGES import Metal if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end end diff --git a/test/test_extensions.jl b/test/test_extensions.jl index 75e54466..b9a47ec9 100644 --- a/test/test_extensions.jl +++ b/test/test_extensions.jl @@ -10,7 +10,7 @@ end import AMDGPU if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end -@static if Sys.isapple() && PKG_METAL in TEST_PACKAGES +@static if PKG_METAL in TEST_PACKAGES import Metal if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end end diff --git a/test/test_incremental_compilation.jl b/test/test_incremental_compilation.jl index f0b49a9a..0a82ddf0 100644 --- a/test/test_incremental_compilation.jl +++ b/test/test_incremental_compilation.jl @@ -9,7 +9,7 @@ end import AMDGPU if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end -@static if Sys.isapple() && PKG_METAL in TEST_PACKAGES +@static if PKG_METAL in TEST_PACKAGES import Metal if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end end diff --git a/test/test_init_parallel_stencil.jl b/test/test_init_parallel_stencil.jl index c3dc5ec6..c4ac67ee 100644 --- a/test/test_init_parallel_stencil.jl +++ b/test/test_init_parallel_stencil.jl @@ -13,7 +13,7 @@ end import AMDGPU if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end -@static if Sys.isapple() && PKG_METAL in TEST_PACKAGES +@static if 
PKG_METAL in TEST_PACKAGES import Metal if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end end diff --git a/test/test_parallel.jl b/test/test_parallel.jl index ea6acb38..dc1009d2 100644 --- a/test/test_parallel.jl +++ b/test/test_parallel.jl @@ -15,7 +15,7 @@ end import AMDGPU if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end -@static if Sys.isapple() && PKG_METAL in TEST_PACKAGES +@static if PKG_METAL in TEST_PACKAGES import Metal if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end end diff --git a/test/test_reset_parallel_stencil.jl b/test/test_reset_parallel_stencil.jl index a5be1bdf..d160537e 100644 --- a/test/test_reset_parallel_stencil.jl +++ b/test/test_reset_parallel_stencil.jl @@ -11,7 +11,7 @@ end import AMDGPU if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end -@static if Sys.isapple() && PKG_METAL in TEST_PACKAGES +@static if PKG_METAL in TEST_PACKAGES import Metal if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end end From 527444bd8dd68cc4be91a1ab718492c318fd3c24 Mon Sep 17 00:00:00 2001 From: GiackAloZ Date: Mon, 28 Oct 2024 16:34:34 +0100 Subject: [PATCH 32/46] Refactor harm macros to use `inv` function instead of division --- src/FiniteDifferences.jl | 56 ++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/src/FiniteDifferences.jl b/src/FiniteDifferences.jl index 4bddbadb..82e83072 100644 --- a/src/FiniteDifferences.jl +++ b/src/FiniteDifferences.jl @@ -55,7 +55,7 @@ macro d2(A) @expandargs(A); esc(:( ($A[$ixi+1] - $A[$ixi]) - ($A[$ixi] - macro all(A) @expandargs(A); esc(:( $A[$ix ] )) end macro inn(A) @expandargs(A); esc(:( $A[$ixi ] )) end macro av(A) @expandargs(A); esc(:(($A[$ix] + $A[$ix+1] )*0.5 )) end -macro harm(A) @expandargs(A); esc(:(2/(1/$A[$ix] + 1/$A[$ix+1]) )) end +macro harm(A) @expandargs(A); esc(:( inv(inv($A[$ix]) + inv($A[$ix+1]))*2.0 )) end macro maxloc(A) @expandargs(A); esc(:( max( max($A[$ixi-1], $A[$ixi+1]), $A[$ixi] ) )) end macro minloc(A) @expandargs(A); esc(:( min( min($A[$ixi-1], $A[$ixi+1]), $A[$ixi] ) )) end @@ -172,11 +172,11 @@ macro av_xa(A) @expandargs(A); esc(:(($A[$ix ,$iy ] + $A[$ix+1,$iy ] )*0 macro av_ya(A) @expandargs(A); esc(:(($A[$ix ,$iy ] + $A[$ix ,$iy+1] )*0.5 )) end macro av_xi(A) @expandargs(A); esc(:(($A[$ix ,$iyi ] + $A[$ix+1,$iyi ] )*0.5 )) end macro av_yi(A) @expandargs(A); esc(:(($A[$ixi ,$iy ] + $A[$ixi ,$iy+1] )*0.5 )) end -macro harm(A) @expandargs(A); esc(:(4/(1/$A[$ix ,$iy ] + 1/$A[$ix+1,$iy ] + 1/$A[$ix,$iy+1] + 1/$A[$ix+1,$iy+1]) )) end -macro harm_xa(A) @expandargs(A); esc(:(2/(1/$A[$ix ,$iy ] + 1/$A[$ix+1,$iy ] ) )) end -macro harm_ya(A) @expandargs(A); esc(:(2/(1/$A[$ix ,$iy ] + 1/$A[$ix ,$iy+1] ) )) end -macro harm_xi(A) @expandargs(A); esc(:(2/(1/$A[$ix ,$iyi ] + 1/$A[$ix+1,$iyi ] ) )) end -macro harm_yi(A) @expandargs(A); esc(:(2/(1/$A[$ixi ,$iy ] + 1/$A[$ixi ,$iy+1] ) )) end +macro harm(A) @expandargs(A); esc(:( inv(inv($A[$ix ,$iy ]) + inv($A[$ix+1,$iy ]) + inv($A[$ix,$iy+1]) + inv($A[$ix+1,$iy+1]))*4.0 )) end +macro harm_xa(A) @expandargs(A); esc(:( inv(inv($A[$ix ,$iy ]) + inv($A[$ix+1,$iy ]))*2.0 )) end +macro harm_ya(A) @expandargs(A); esc(:( inv(inv($A[$ix ,$iy ]) + inv($A[$ix ,$iy+1]))*2.0 )) end +macro harm_xi(A) @expandargs(A); esc(:( inv(inv($A[$ix ,$iyi ]) + inv($A[$ix+1,$iyi ]))*2.0 )) end +macro harm_yi(A) @expandargs(A); esc(:( inv(inv($A[$ixi ,$iy ]) + inv($A[$ixi ,$iy+1]))*2.0 
)) end macro maxloc(A) @expandargs(A); esc(:( max( max( max($A[$ixi-1,$iyi ], $A[$ixi+1,$iyi ]) , $A[$ixi ,$iyi ] ), max($A[$ixi ,$iyi-1], $A[$ixi ,$iyi+1]) ) )) end macro minloc(A) @expandargs(A); esc(:( min( min( min($A[$ixi-1,$iyi ], $A[$ixi+1,$iyi ]) , $A[$ixi ,$iyi ] ), @@ -361,28 +361,28 @@ macro av_xzi(A) @expandargs(A); esc(:(($A[$ix ,$iyi ,$iz ] + $A[$ix+1,$iyi $A[$ix ,$iyi ,$iz+1] + $A[$ix+1,$iyi ,$iz+1] )*0.25 )) end macro av_yzi(A) @expandargs(A); esc(:(($A[$ixi ,$iy ,$iz ] + $A[$ixi ,$iy+1,$iz ] + $A[$ixi ,$iy ,$iz+1] + $A[$ixi ,$iy+1,$iz+1] )*0.25 )) end -macro harm(A) @expandargs(A); esc(:(8/(1/$A[$ix ,$iy ,$iz ] + 1/$A[$ix+1,$iy ,$iz ] + - 1/$A[$ix+1,$iy+1,$iz ] + 1/$A[$ix+1,$iy+1,$iz+1] + - 1/$A[$ix ,$iy+1,$iz+1] + 1/$A[$ix ,$iy ,$iz+1] + - 1/$A[$ix+1,$iy ,$iz+1] + 1/$A[$ix ,$iy+1,$iz ] ) )) end -macro harm_xa(A) @expandargs(A); esc(:(2/(1/$A[$ix ,$iy ,$iz ] + 1/$A[$ix+1,$iy ,$iz ] ) )) end -macro harm_ya(A) @expandargs(A); esc(:(2/(1/$A[$ix ,$iy ,$iz ] + 1/$A[$ix ,$iy+1,$iz ] ) )) end -macro harm_za(A) @expandargs(A); esc(:(2/(1/$A[$ix ,$iy ,$iz ] + 1/$A[$ix ,$iy ,$iz+1] ) )) end -macro harm_xi(A) @expandargs(A); esc(:(2/(1/$A[$ix ,$iyi ,$izi ] + 1/$A[$ix+1,$iyi ,$izi ] ) )) end -macro harm_yi(A) @expandargs(A); esc(:(2/(1/$A[$ixi ,$iy ,$izi ] + 1/$A[$ixi ,$iy+1,$izi ] ) )) end -macro harm_zi(A) @expandargs(A); esc(:(2/(1/$A[$ixi ,$iyi ,$iz ] + 1/$A[$ixi ,$iyi ,$iz+1] ) )) end -macro harm_xya(A) @expandargs(A); esc(:(4/(1/$A[$ix ,$iy ,$iz ] + 1/$A[$ix+1,$iy ,$iz ] + - 1/$A[$ix ,$iy+1,$iz ] + 1/$A[$ix+1,$iy+1,$iz ] ) )) end -macro harm_xza(A) @expandargs(A); esc(:(4/(1/$A[$ix ,$iy ,$iz ] + 1/$A[$ix+1,$iy ,$iz ] + - 1/$A[$ix ,$iy ,$iz+1] + 1/$A[$ix+1,$iy ,$iz+1] ) )) end -macro harm_yza(A) @expandargs(A); esc(:(4/(1/$A[$ix ,$iy ,$iz ] + 1/$A[$ix ,$iy+1,$iz ] + - 1/$A[$ix ,$iy ,$iz+1] + 1/$A[$ix ,$iy+1,$iz+1] ) )) end -macro harm_xyi(A) @expandargs(A); esc(:(4/(1/$A[$ix ,$iy ,$izi ] + 1/$A[$ix+1,$iy ,$izi ] + - 1/$A[$ix ,$iy+1,$izi ] + 1/$A[$ix+1,$iy+1,$izi ] ) )) end -macro harm_xzi(A) @expandargs(A); esc(:(4/(1/$A[$ix ,$iyi ,$iz ] + 1/$A[$ix+1,$iyi ,$iz ] + - 1/$A[$ix ,$iyi ,$iz+1] + 1/$A[$ix+1,$iyi ,$iz+1] ) )) end -macro harm_yzi(A) @expandargs(A); esc(:(4/(1/$A[$ixi ,$iy ,$iz ] + 1/$A[$ixi ,$iy+1,$iz ] + - 1/$A[$ixi ,$iy ,$iz+1] + 1/$A[$ixi ,$iy+1,$iz+1] ) )) end +macro harm(A) @expandargs(A); esc(:( inv(inv($A[$ix ,$iy ,$iz ]) + inv($A[$ix+1,$iy ,$iz ]) + + inv($A[$ix+1,$iy+1,$iz ]) + inv($A[$ix+1,$iy+1,$iz+1]) + + inv($A[$ix ,$iy+1,$iz+1]) + inv($A[$ix ,$iy ,$iz+1]) + + inv($A[$ix+1,$iy ,$iz+1]) + inv($A[$ix ,$iy+1,$iz ]) )*8.0 )) end +macro harm_xa(A) @expandargs(A); esc(:( inv(inv($A[$ix ,$iy ,$iz ]) + inv($A[$ix+1,$iy ,$iz ]) )*2.0 )) end +macro harm_ya(A) @expandargs(A); esc(:( inv(inv($A[$ix ,$iy ,$iz ]) + inv($A[$ix ,$iy+1,$iz ]) )*2.0 )) end +macro harm_za(A) @expandargs(A); esc(:( inv(inv($A[$ix ,$iy ,$iz ]) + inv($A[$ix ,$iy ,$iz+1]) )*2.0 )) end +macro harm_xi(A) @expandargs(A); esc(:( inv(inv($A[$ix ,$iyi ,$izi ]) + inv($A[$ix+1,$iyi ,$izi ]) )*2.0 )) end +macro harm_yi(A) @expandargs(A); esc(:( inv(inv($A[$ixi ,$iy ,$izi ]) + inv($A[$ixi ,$iy+1,$izi ]) )*2.0 )) end +macro harm_zi(A) @expandargs(A); esc(:( inv(inv($A[$ixi ,$iyi ,$iz ]) + inv($A[$ixi ,$iyi ,$iz+1]) )*2.0 )) end +macro harm_xya(A) @expandargs(A); esc(:( inv(inv($A[$ix ,$iy ,$iz ]) + inv($A[$ix+1,$iy ,$iz ]) + + inv($A[$ix ,$iy+1,$iz ]) + inv($A[$ix+1,$iy+1,$iz ]) )*4.0 )) end +macro harm_xza(A) @expandargs(A); esc(:( inv(inv($A[$ix ,$iy ,$iz ]) + inv($A[$ix+1,$iy ,$iz ]) + + inv($A[$ix ,$iy 
,$iz+1]) + inv($A[$ix+1,$iy ,$iz+1]) )*4.0 )) end +macro harm_yza(A) @expandargs(A); esc(:( inv(inv($A[$ix ,$iy ,$iz ]) + inv($A[$ix ,$iy+1,$iz ]) + + inv($A[$ix ,$iy ,$iz+1]) + inv($A[$ix ,$iy+1,$iz+1]) )*4.0 )) end +macro harm_xyi(A) @expandargs(A); esc(:( inv(inv($A[$ix ,$iy ,$izi ]) + inv($A[$ix+1,$iy ,$izi ]) + + inv($A[$ix ,$iy+1,$izi ]) + inv($A[$ix+1,$iy+1,$izi ]) )*4.0 )) end +macro harm_xzi(A) @expandargs(A); esc(:( inv(inv($A[$ix ,$iyi ,$iz ]) + inv($A[$ix+1,$iyi ,$iz ]) + + inv($A[$ix ,$iyi ,$iz+1]) + inv($A[$ix+1,$iyi ,$iz+1]) )*4.0 )) end +macro harm_yzi(A) @expandargs(A); esc(:( inv(inv($A[$ixi ,$iy ,$iz ]) + inv($A[$ixi ,$iy+1,$iz ]) + + inv($A[$ixi ,$iy ,$iz+1]) + inv($A[$ixi ,$iy+1,$iz+1]) )*4.0 )) end macro maxloc(A) @expandargs(A); esc(:( max( max( max( max($A[$ixi-1,$iyi ,$izi ], $A[$ixi+1,$iyi ,$izi ]) , $A[$ixi ,$iyi ,$izi ] ), max($A[$ixi ,$iyi-1,$izi ], $A[$ixi ,$iyi+1,$izi ]) ), max($A[$ixi ,$iyi ,$izi-1], $A[$ixi ,$iyi ,$izi+1]) ) )) end From cc6e5ee76f8f34750030dbb41a369f0cf64059ee Mon Sep 17 00:00:00 2001 From: GiackAloZ Date: Mon, 28 Oct 2024 16:51:48 +0100 Subject: [PATCH 33/46] Rollback equality checks from approx to exact --- .../ParallelKernel/test_hide_communication.jl | 16 +-- test/ParallelKernel/test_parallel.jl | 22 ++-- test/test_FiniteDifferences1D.jl | 30 ++--- test/test_FiniteDifferences2D.jl | 66 +++++------ test/test_FiniteDifferences3D.jl | 108 +++++++++--------- test/test_parallel.jl | 90 +++++++-------- 6 files changed, 166 insertions(+), 166 deletions(-) diff --git a/test/ParallelKernel/test_hide_communication.jl b/test/ParallelKernel/test_hide_communication.jl index 6c7c7704..d018bc4c 100644 --- a/test/ParallelKernel/test_hide_communication.jl +++ b/test/ParallelKernel/test_hide_communication.jl @@ -94,7 +94,7 @@ eval(:( @parallel add_indices!(A); communication!(A); end - @test all(Array(A) .≈ communication!([ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)])) + @test all(Array(A) .== communication!([ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)])) end; @testset "@hide_communication boundary_width block" begin # This test verifies that the results are correct, even for CUDA.jl < v2.0, where it cannot overlap. 
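# NOTE (illustrative sketch, not part of the patch): the hunks of this "Rollback equality checks"
# patch replace `.≈` (isapprox) with `.==`. A minimal plain-Julia sketch of why exact equality is
# safe for these communication tests: the kernels write integer index values which, for the small
# array sizes used in the tests, are exactly representable in the floating-point element type, so
# no tolerance is needed. The array and element type below are assumptions for illustration only
# (no ParallelStencil or GPU code involved).
A = zeros(Float32, 6, 7, 8)
for iz in axes(A, 3), iy in axes(A, 2), ix in axes(A, 1)
    A[ix, iy, iz] = ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2)  # same index formula as in the tests
end
ref = [ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]
@assert all(A .== ref)  # exact equality holds; `.≈` would also pass but is a weaker check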
A = @zeros(6, 7, 8) @@ -107,7 +107,7 @@ eval(:( communication_y!(A); communication_z!(A); end - @test all(Array(A) .≈ communication!([ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)])) + @test all(Array(A) .== communication!([ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)])) end; @testset "@hide_communication boundary_width block" begin A = @zeros(6, 7, 8) @@ -122,7 +122,7 @@ eval(:( communication_y!(A); communication_z!(A); end - @test all(Array(A) .≈ communication!([ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)])) + @test all(Array(A) .== communication!([ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)])) end; @testset "@hide_communication boundary_width computation_calls=2 block" begin A = @zeros(6, 7, 8) @@ -131,7 +131,7 @@ eval(:( @parallel add_indices2!(A); communication!(A); end - @test all(Array(A) .≈ communication!([2*(ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2)) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)])) + @test all(Array(A) .== communication!([2*(ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2)) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)])) end; @testset "@hide_communication boundary_width computation_calls=2 block" begin A = @zeros(6, 7, 8) @@ -140,7 +140,7 @@ eval(:( @parallel (1:6, 1:7, 1:8) add_indices2!(A); communication!(A); end - @test all(Array(A) .≈ communication!([2*(ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2)) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)])) + @test all(Array(A) .== communication!([2*(ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2)) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)])) end; @testset "@hide_communication boundary_width computation_calls=2 block" begin A = @zeros(6, 7, 8) @@ -149,7 +149,7 @@ eval(:( @parallel (1:6, 1:7, 1:8) add_indices2!(A); communication!(A); end - @test all(Array(A) .≈ communication!([2*(ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2)) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)])) + @test all(Array(A) .== communication!([2*(ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2)) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)])) end; @testset "@hide_communication boundary_width computation_calls=3 block" begin A = @zeros(6, 7, 8) @@ -159,7 +159,7 @@ eval(:( @parallel add_indices3!(A); communication!(A); end - @test all(Array(A) .≈ communication!([3*(ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2)) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)])) + @test all(Array(A) .== communication!([3*(ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2)) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)])) end; @testset "@hide_communication ranges_outer ranges_inner block" begin A = @zeros(6, 7, 8) @@ -169,7 +169,7 @@ eval(:( @parallel add_indices!(A); communication!(A); end - @test all(Array(A) .≈ communication!([ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)])) + @test all(Array(A) .== communication!([ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)])) end; end; @reset_parallel_kernel() diff --git a/test/ParallelKernel/test_parallel.jl b/test/ParallelKernel/test_parallel.jl index c2ab5856..fcba1dbf 100644 --- a/test/ParallelKernel/test_parallel.jl +++ b/test/ParallelKernel/test_parallel.jl @@ -379,7 +379,7 @@ eval(:( return end @parallel 
write_indices!(A); - @test all(Array(A) .≈ [ix for ix=1:size(A,1)]) + @test all(Array(A) .== [ix for ix=1:size(A,1)]) end; @testset "@parallel_indices (2D)" begin A = @zeros(4, 5) @@ -388,7 +388,7 @@ eval(:( return end @parallel write_indices!(A); - @test all(Array(A) .≈ [ix + (iy-1)*size(A,1) for ix=1:size(A,1), iy=1:size(A,2)]) + @test all(Array(A) .== [ix + (iy-1)*size(A,1) for ix=1:size(A,1), iy=1:size(A,2)]) end; @testset "@parallel_indices (3D)" begin A = @zeros(4, 5, 6) @@ -397,7 +397,7 @@ eval(:( return end @parallel write_indices!(A); - @test all(Array(A) .≈ [ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]) + @test all(Array(A) .== [ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]) end; @testset "@parallel_indices (1D in 3D)" begin A = @zeros(4, 5, 6) @@ -406,7 +406,7 @@ eval(:( return end @parallel 1:size(A,2) write_indices!(A); - @test all(Array(A)[1,:,1] .≈ [iy for iy=1:size(A,2)]) + @test all(Array(A)[1,:,1] .== [iy for iy=1:size(A,2)]) end; @testset "@parallel_indices (2D in 3D)" begin A = @zeros(4, 5, 6) @@ -415,7 +415,7 @@ eval(:( return end @parallel (1:size(A,1), 1:size(A,3)) write_indices!(A); - @test all(Array(A)[:,end,:] .≈ [ix + (iz-1)*size(A,1) for ix=1:size(A,1), iz=1:size(A,3)]) + @test all(Array(A)[:,end,:] .== [ix + (iz-1)*size(A,1) for ix=1:size(A,1), iz=1:size(A,3)]) end; @testset "@parallel_indices (2D in 3D with macro)" begin A = @zeros(4, 5, 6) @@ -424,7 +424,7 @@ eval(:( return end @parallel (1:size(A,1), 1:size(A,3)) write_indices!(A); - @test all(Array(A)[:,end,:] .≈ [ix + (iz-1)*size(A,1) for ix=1:size(A,1), iz=1:size(A,3)]) + @test all(Array(A)[:,end,:] .== [ix + (iz-1)*size(A,1) for ix=1:size(A,1), iz=1:size(A,3)]) end; @testset "@parallel_indices (2D in 3D with macro with aliases)" begin A = @zeros(4, 5, 6) @@ -433,7 +433,7 @@ eval(:( return end @parallel (1:size(A,1), 1:size(A,3)) write_indices!(A); - @test all(Array(A)[:,end,:] .≈ [ix + (iz-1)*size(A,1) for ix=1:size(A,1), iz=1:size(A,3)]) + @test all(Array(A)[:,end,:] .== [ix + (iz-1)*size(A,1) for ix=1:size(A,1), iz=1:size(A,3)]) end; @static if $package != $PKG_POLYESTER @testset "nested function (long definition, array modification)" begin @@ -447,7 +447,7 @@ eval(:( return end @parallel write_indices!(A); - @test all(Array(A) .≈ [ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]) + @test all(Array(A) .== [ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]) end; @testset "nested function (short definition, array modification)" begin A = @zeros(4, 5, 6) @@ -457,7 +457,7 @@ eval(:( return end @parallel write_indices!(A); - @test all(Array(A) .≈ [ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]) + @test all(Array(A) .== [ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]) end; @testset "nested function (long definition, return value)" begin A = @zeros(4, 5, 6) @@ -469,7 +469,7 @@ eval(:( return end @parallel write_indices!(A); - @test all(Array(A) .≈ [ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]) + @test all(Array(A) .== [ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]) end; @testset "nested function (short definition, return value)" begin A = @zeros(4, 5, 6) @@ -479,7 
+479,7 @@ eval(:( return end @parallel write_indices!(A); - @test all(Array(A) .≈ [ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]) + @test all(Array(A) .== [ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]) end; end end; diff --git a/test/test_FiniteDifferences1D.jl b/test/test_FiniteDifferences1D.jl index 97753b2f..63681bc0 100644 --- a/test/test_FiniteDifferences1D.jl +++ b/test/test_FiniteDifferences1D.jl @@ -41,44 +41,44 @@ eval(:( @testset "differences" begin @parallel d!(R, Ax) = (@all(R) = @d(Ax); return) @parallel d2!(R, Axx) = (@all(R) = @d2(Axx); return) - R.=0; @parallel d!(R, Ax); @test all(Array(R .≈ Ax[2:end].-Ax[1:end-1])) # INFO: AMDGPU arrays need to be compared on CPU - R.=0; @parallel d2!(R, Axx); @test all(Array(R .≈ (Axx[3:end].-Axx[2:end-1]).-(Axx[2:end-1].-Axx[1:end-2]))) + R.=0; @parallel d!(R, Ax); @test all(Array(R .== Ax[2:end].-Ax[1:end-1])) # INFO: AMDGPU arrays need to be compared on CPU + R.=0; @parallel d2!(R, Axx); @test all(Array(R .== (Axx[3:end].-Axx[2:end-1]).-(Axx[2:end-1].-Axx[1:end-2]))) end; @testset "selection" begin @parallel all!(R, A) = (@all(R) = @all(A); return) @parallel inn!(R, Axx) = (@all(R) = @inn(Axx); return) - R.=0; @parallel all!(R, A); @test all(Array(R .≈ A)) - R.=0; @parallel inn!(R, Axx); @test all(Array(R .≈ Axx[2:end-1])) + R.=0; @parallel all!(R, A); @test all(Array(R .== A)) + R.=0; @parallel inn!(R, Axx); @test all(Array(R .== Axx[2:end-1])) end; @testset "averages" begin @parallel av!(R, Ax) = (@all(R) = @av(Ax); return) - R.=0; @parallel av!(R, Ax); @test all(Array(R .≈ (Ax[1:end-1].+Ax[2:end])./2)) + R.=0; @parallel av!(R, Ax); @test all(Array(R .== (Ax[1:end-1].+Ax[2:end])./2)) end; @testset "harmonic averages" begin @parallel harm!(R, Ax) = (@all(R) = @harm(Ax); return) - R.=0; @parallel harm!(R, Ax); @test all(Array(R .≈ 2 ./(1 ./Ax[1:end-1].+1 ./Ax[2:end]))) + R.=0; @parallel harm!(R, Ax); @test all(Array(R .== 2 ./(1 ./Ax[1:end-1].+1 ./Ax[2:end]))) end; @testset "others" begin @parallel maxloc!(R, Axx) = (@all(R) = @maxloc(Axx); return) - R.=0; @parallel maxloc!(R, Axx); @test all(Array(R .≈ max.(max.(Axx[3:end],Axx[2:end-1]),Axx[1:end-2]))) + R.=0; @parallel maxloc!(R, Axx); @test all(Array(R .== max.(max.(Axx[3:end],Axx[2:end-1]),Axx[1:end-2]))) end; end; @testset "2. apply masks" begin @testset "selection" begin @parallel inn_all!(Rxx, A) = (@inn(Rxx) = @all(A); return) @parallel inn_inn!(Rxx, Axx) = (@inn(Rxx) = @inn(Axx); return) - Rxx.=0; @parallel inn_all!(Rxx, A); @test all(Array(Rxx[2:end-1] .≈ A)) - Rxx[2:end-1].=0; @test all(Array(Rxx .≈ 0)) # Test that boundary values remained zero. - Rxx.=0; @parallel inn_inn!(Rxx, Axx); @test all(Array(Rxx[2:end-1] .≈ Axx[2:end-1])) - Rxx[2:end-1].=0; @test all(Array(Rxx .≈ 0)) # Test that boundary values remained zero. + Rxx.=0; @parallel inn_all!(Rxx, A); @test all(Array(Rxx[2:end-1] .== A)) + Rxx[2:end-1].=0; @test all(Array(Rxx .== 0)) # Test that boundary values remained zero. + Rxx.=0; @parallel inn_inn!(Rxx, Axx); @test all(Array(Rxx[2:end-1] .== Axx[2:end-1])) + Rxx[2:end-1].=0; @test all(Array(Rxx .== 0)) # Test that boundary values remained zero. 
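# NOTE (illustrative sketch, not part of the patch): the @harm tests in these files compare the
# inv-based form introduced in the "Refactor harm macros" patch against the division-based
# reference `2 ./ (1 ./ A[1:end-1] .+ 1 ./ A[2:end])`. A small plain-Julia sketch of the underlying
# identity, with arbitrary example values: both forms compute the harmonic mean 2ab/(a+b) of
# neighbouring entries.
a, b     = 3.0f0, 5.0f0
harm_div = 2 / (1/a + 1/b)            # reference form used in the tests
harm_inv = inv(inv(a) + inv(b)) * 2   # form produced by the refactored @harm macro
@assert harm_div ≈ 2a*b/(a+b)         # harmonic mean (3.75 for these values)
@assert harm_inv ≈ 2a*b/(a+b)         # algebraically identical to the division-based form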
end; @testset "differences" begin @parallel inn_d!(Rxx, Ax) = (@inn(Rxx) = @d(Ax); return) @parallel inn_d2!(Rxx, Axx) = (@inn(Rxx) = @d2(Axx); return) - Rxx.=0; @parallel inn_d!(Rxx, Ax); @test all(Array(Rxx[2:end-1] .≈ Ax[2:end].-Ax[1:end-1])) - Rxx[2:end-1].=0; @test all(Array(Rxx .≈ 0)) # Test that boundary values remained zero. - Rxx.=0; @parallel inn_d2!(Rxx, Axx); @test all(Array(Rxx[2:end-1] .≈ (Axx[3:end].-Axx[2:end-1]).-(Axx[2:end-1].-Axx[1:end-2]))) - Rxx[2:end-1].=0; @test all(Array(Rxx .≈ 0)) # Test that boundary values remained zero. + Rxx.=0; @parallel inn_d!(Rxx, Ax); @test all(Array(Rxx[2:end-1] .== Ax[2:end].-Ax[1:end-1])) + Rxx[2:end-1].=0; @test all(Array(Rxx .== 0)) # Test that boundary values remained zero. + Rxx.=0; @parallel inn_d2!(Rxx, Axx); @test all(Array(Rxx[2:end-1] .== (Axx[3:end].-Axx[2:end-1]).-(Axx[2:end-1].-Axx[1:end-2]))) + Rxx[2:end-1].=0; @test all(Array(Rxx .== 0)) # Test that boundary values remained zero. end; end; @reset_parallel_stencil() diff --git a/test/test_FiniteDifferences2D.jl b/test/test_FiniteDifferences2D.jl index 73dd0aea..96f73a13 100644 --- a/test/test_FiniteDifferences2D.jl +++ b/test/test_FiniteDifferences2D.jl @@ -53,24 +53,24 @@ eval(:( @parallel d2_ya!(R, Ayy) = (@all(R) = @d2_ya(Ayy); return) @parallel d2_xi!(R, Axxyy) = (@all(R) = @d2_xi(Axxyy); return) @parallel d2_yi!(R, Axxyy) = (@all(R) = @d2_yi(Axxyy); return) - R.=0; @parallel d_xa!(R, Ax); @test all(Array(R .≈ Ax[2:end, :].-Ax[1:end-1, :])) - R.=0; @parallel d_ya!(R, Ay); @test all(Array(R .≈ Ay[ :,2:end].-Ay[ :,1:end-1])) - R.=0; @parallel d_xi!(R, Axyy); @test all(Array(R .≈ Axyy[2:end ,2:end-1].-Axyy[1:end-1,2:end-1])) - R.=0; @parallel d_yi!(R, Axxy); @test all(Array(R .≈ Axxy[2:end-1,2:end ].-Axxy[2:end-1,1:end-1])) - R.=0; @parallel d2_xa!(R, Axx); @test all(Array(R .≈ (Axx[3:end, :].-Axx[2:end-1, :]).-(Axx[2:end-1, :].-Axx[1:end-2, :]))) - R.=0; @parallel d2_ya!(R, Ayy); @test all(Array(R .≈ (Ayy[ :,3:end].-Ayy[ :,2:end-1]).-(Ayy[ :,2:end-1].-Ayy[ :,1:end-2]))) - R.=0; @parallel d2_xi!(R, Axxyy); @test all(Array(R .≈ (Axxyy[3:end,2:end-1].-Axxyy[2:end-1,2:end-1]).-(Axxyy[2:end-1,2:end-1].-Axxyy[1:end-2,2:end-1]))) - R.=0; @parallel d2_yi!(R, Axxyy); @test all(Array(R .≈ (Axxyy[2:end-1,3:end].-Axxyy[2:end-1,2:end-1]).-(Axxyy[2:end-1,2:end-1].-Axxyy[2:end-1,1:end-2]))) + R.=0; @parallel d_xa!(R, Ax); @test all(Array(R .== Ax[2:end, :].-Ax[1:end-1, :])) + R.=0; @parallel d_ya!(R, Ay); @test all(Array(R .== Ay[ :,2:end].-Ay[ :,1:end-1])) + R.=0; @parallel d_xi!(R, Axyy); @test all(Array(R .== Axyy[2:end ,2:end-1].-Axyy[1:end-1,2:end-1])) + R.=0; @parallel d_yi!(R, Axxy); @test all(Array(R .== Axxy[2:end-1,2:end ].-Axxy[2:end-1,1:end-1])) + R.=0; @parallel d2_xa!(R, Axx); @test all(Array(R .== (Axx[3:end, :].-Axx[2:end-1, :]).-(Axx[2:end-1, :].-Axx[1:end-2, :]))) + R.=0; @parallel d2_ya!(R, Ayy); @test all(Array(R .== (Ayy[ :,3:end].-Ayy[ :,2:end-1]).-(Ayy[ :,2:end-1].-Ayy[ :,1:end-2]))) + R.=0; @parallel d2_xi!(R, Axxyy); @test all(Array(R .== (Axxyy[3:end,2:end-1].-Axxyy[2:end-1,2:end-1]).-(Axxyy[2:end-1,2:end-1].-Axxyy[1:end-2,2:end-1]))) + R.=0; @parallel d2_yi!(R, Axxyy); @test all(Array(R .== (Axxyy[2:end-1,3:end].-Axxyy[2:end-1,2:end-1]).-(Axxyy[2:end-1,2:end-1].-Axxyy[2:end-1,1:end-2]))) end; @testset "selection" begin @parallel all!(R, A) = (@all(R) = @all(A); return) @parallel inn!(R, Axxyy) = (@all(R) = @inn(Axxyy); return) @parallel inn_x!(R, Axx) = (@all(R) = @inn_x(Axx); return) @parallel inn_y!(R, Ayy) = (@all(R) = @inn_y(Ayy); return) - R.=0; @parallel 
all!(R, A); @test all(Array(R .≈ A)) - R.=0; @parallel inn!(R, Axxyy); @test all(Array(R .≈ Axxyy[2:end-1,2:end-1])) - R.=0; @parallel inn_x!(R, Axx); @test all(Array(R .≈ Axx[2:end-1, :])) - R.=0; @parallel inn_y!(R, Ayy); @test all(Array(R .≈ Ayy[ :,2:end-1])) + R.=0; @parallel all!(R, A); @test all(Array(R .== A)) + R.=0; @parallel inn!(R, Axxyy); @test all(Array(R .== Axxyy[2:end-1,2:end-1])) + R.=0; @parallel inn_x!(R, Axx); @test all(Array(R .== Axx[2:end-1, :])) + R.=0; @parallel inn_y!(R, Ayy); @test all(Array(R .== Ayy[ :,2:end-1])) end; @testset "averages" begin @parallel av!(R, Axy) = (@all(R) = @av(Axy); return) @@ -78,11 +78,11 @@ eval(:( @parallel av_ya!(R, Ay) = (@all(R) = @av_ya(Ay); return) @parallel av_xi!(R, Axyy) = (@all(R) = @av_xi(Axyy); return) @parallel av_yi!(R, Axxy) = (@all(R) = @av_yi(Axxy); return) - R.=0; @parallel av!(R, Axy); @test all(Array(R .≈ (Axy[1:end-1,1:end-1].+Axy[2:end,1:end-1].+Axy[1:end-1,2:end].+Axy[2:end,2:end])./4)) - R.=0; @parallel av_xa!(R, Ax); @test all(Array(R .≈ (Ax[2:end, :].+Ax[1:end-1, :])./2)) - R.=0; @parallel av_ya!(R, Ay); @test all(Array(R .≈ (Ay[ :,2:end].+Ay[ :,1:end-1])./2)) - R.=0; @parallel av_xi!(R, Axyy); @test all(Array(R .≈ (Axyy[2:end ,2:end-1].+Axyy[1:end-1,2:end-1])./2)) - R.=0; @parallel av_yi!(R, Axxy); @test all(Array(R .≈ (Axxy[2:end-1,2:end ].+Axxy[2:end-1,1:end-1])./2)) + R.=0; @parallel av!(R, Axy); @test all(Array(R .== (Axy[1:end-1,1:end-1].+Axy[2:end,1:end-1].+Axy[1:end-1,2:end].+Axy[2:end,2:end])./4)) + R.=0; @parallel av_xa!(R, Ax); @test all(Array(R .== (Ax[2:end, :].+Ax[1:end-1, :])./2)) + R.=0; @parallel av_ya!(R, Ay); @test all(Array(R .== (Ay[ :,2:end].+Ay[ :,1:end-1])./2)) + R.=0; @parallel av_xi!(R, Axyy); @test all(Array(R .== (Axyy[2:end ,2:end-1].+Axyy[1:end-1,2:end-1])./2)) + R.=0; @parallel av_yi!(R, Axxy); @test all(Array(R .== (Axxy[2:end-1,2:end ].+Axxy[2:end-1,1:end-1])./2)) end; @testset "harmonic averages" begin @parallel harm!(R, Axy) = (@all(R) = @harm(Axy); return) @@ -90,36 +90,36 @@ eval(:( @parallel harm_ya!(R, Ay) = (@all(R) = @harm_ya(Ay); return) @parallel harm_xi!(R, Axyy) = (@all(R) = @harm_xi(Axyy); return) @parallel harm_yi!(R, Axxy) = (@all(R) = @harm_yi(Axxy); return) - R.=0; @parallel harm!(R, Axy); @test all(Array(R .≈ 4 ./(1 ./Axy[1:end-1,1:end-1].+1 ./Axy[2:end,1:end-1].+1 ./Axy[1:end-1,2:end].+1 ./Axy[2:end,2:end]))) - R.=0; @parallel harm_xa!(R, Ax); @test all(Array(R .≈ 2 ./(1 ./Ax[2:end, :].+1 ./Ax[1:end-1, :]))) - R.=0; @parallel harm_ya!(R, Ay); @test all(Array(R .≈ 2 ./(1 ./Ay[ :,2:end].+1 ./Ay[ :,1:end-1]))) - R.=0; @parallel harm_xi!(R, Axyy); @test all(Array(R .≈ 2 ./(1 ./Axyy[2:end ,2:end-1].+1 ./Axyy[1:end-1,2:end-1]))) - R.=0; @parallel harm_yi!(R, Axxy); @test all(Array(R .≈ 2 ./(1 ./Axxy[2:end-1,2:end ].+1 ./Axxy[2:end-1,1:end-1]))) + R.=0; @parallel harm!(R, Axy); @test all(Array(R .== 4 ./(1 ./Axy[1:end-1,1:end-1].+1 ./Axy[2:end,1:end-1].+1 ./Axy[1:end-1,2:end].+1 ./Axy[2:end,2:end]))) + R.=0; @parallel harm_xa!(R, Ax); @test all(Array(R .== 2 ./(1 ./Ax[2:end, :].+1 ./Ax[1:end-1, :]))) + R.=0; @parallel harm_ya!(R, Ay); @test all(Array(R .== 2 ./(1 ./Ay[ :,2:end].+1 ./Ay[ :,1:end-1]))) + R.=0; @parallel harm_xi!(R, Axyy); @test all(Array(R .== 2 ./(1 ./Axyy[2:end ,2:end-1].+1 ./Axyy[1:end-1,2:end-1]))) + R.=0; @parallel harm_yi!(R, Axxy); @test all(Array(R .== 2 ./(1 ./Axxy[2:end-1,2:end ].+1 ./Axxy[2:end-1,1:end-1]))) end; @testset "others" begin @parallel maxloc!(R, Axxyy) = (@all(R) = @maxloc(Axxyy); return) - R.=0; @parallel maxloc!(R, Axxyy); 
@test all(Array(R .≈ max.(max.(max.(max.(Axxyy[1:end-2,2:end-1],Axxyy[3:end,2:end-1]),Axxyy[2:end-1,2:end-1]),Axxyy[2:end-1,1:end-2]),Axxyy[2:end-1,3:end]))) + R.=0; @parallel maxloc!(R, Axxyy); @test all(Array(R .== max.(max.(max.(max.(Axxyy[1:end-2,2:end-1],Axxyy[3:end,2:end-1]),Axxyy[2:end-1,2:end-1]),Axxyy[2:end-1,1:end-2]),Axxyy[2:end-1,3:end]))) end; end; @testset "2. apply masks" begin @testset "selection" begin @parallel inn_all!(Rxxyy, A) = (@inn(Rxxyy) = @all(A); return) @parallel inn_inn!(Rxxyy, Axxyy) = (@inn(Rxxyy) = @inn(Axxyy); return) - Rxxyy.=0; @parallel inn_all!(Rxxyy, A); @test all(Array(Rxxyy[2:end-1,2:end-1] .≈ A)) - Rxxyy[2:end-1,2:end-1].=0; @test all(Array(Rxxyy .≈ 0)) # Test that boundary values remained zero. - Rxxyy.=0; @parallel inn_inn!(Rxxyy, Axxyy); @test all(Array(Rxxyy[2:end-1,2:end-1] .≈ Axxyy[2:end-1,2:end-1])) - Rxxyy[2:end-1,2:end-1].=0; @test all(Array(Rxxyy .≈ 0)) # Test that boundary values remained zero. + Rxxyy.=0; @parallel inn_all!(Rxxyy, A); @test all(Array(Rxxyy[2:end-1,2:end-1] .== A)) + Rxxyy[2:end-1,2:end-1].=0; @test all(Array(Rxxyy .== 0)) # Test that boundary values remained zero. + Rxxyy.=0; @parallel inn_inn!(Rxxyy, Axxyy); @test all(Array(Rxxyy[2:end-1,2:end-1] .== Axxyy[2:end-1,2:end-1])) + Rxxyy[2:end-1,2:end-1].=0; @test all(Array(Rxxyy .== 0)) # Test that boundary values remained zero. end; @testset "differences" begin @parallel inn_d_xa!(Rxxyy, Ax) = (@inn(Rxxyy) = @d_xa(Ax); return) @parallel inn_d_yi!(Rxxyy, Axxy) = (@inn(Rxxyy) = @d_yi(Axxy); return) @parallel inn_d2_yi!(Rxxyy, Axxyy) = (@inn(Rxxyy) = @d2_yi(Axxyy); return) - Rxxyy.=0; @parallel inn_d_xa!(Rxxyy, Ax); @test all(Array(Rxxyy[2:end-1,2:end-1] .≈ Ax[2:end, :].-Ax[1:end-1, :])) - Rxxyy[2:end-1,2:end-1].=0; @test all(Array(Rxxyy .≈ 0)) # Test that boundary values remained zero. - Rxxyy.=0; @parallel inn_d_yi!(Rxxyy, Axxy); @test all(Array(Rxxyy[2:end-1,2:end-1] .≈ Axxy[2:end-1,2:end ].-Axxy[2:end-1,1:end-1])) - Rxxyy[2:end-1,2:end-1].=0; @test all(Array(Rxxyy .≈ 0)) # Test that boundary values remained zero. - Rxxyy.=0; @parallel inn_d2_yi!(Rxxyy, Axxyy); @test all(Array(Rxxyy[2:end-1,2:end-1] .≈ (Axxyy[2:end-1,3:end].-Axxyy[2:end-1,2:end-1]).-(Axxyy[2:end-1,2:end-1].-Axxyy[2:end-1,1:end-2]))) - Rxxyy[2:end-1,2:end-1].=0; @test all(Array(Rxxyy .≈ 0)) # Test that boundary values remained zero. + Rxxyy.=0; @parallel inn_d_xa!(Rxxyy, Ax); @test all(Array(Rxxyy[2:end-1,2:end-1] .== Ax[2:end, :].-Ax[1:end-1, :])) + Rxxyy[2:end-1,2:end-1].=0; @test all(Array(Rxxyy .== 0)) # Test that boundary values remained zero. + Rxxyy.=0; @parallel inn_d_yi!(Rxxyy, Axxy); @test all(Array(Rxxyy[2:end-1,2:end-1] .== Axxy[2:end-1,2:end ].-Axxy[2:end-1,1:end-1])) + Rxxyy[2:end-1,2:end-1].=0; @test all(Array(Rxxyy .== 0)) # Test that boundary values remained zero. + Rxxyy.=0; @parallel inn_d2_yi!(Rxxyy, Axxyy); @test all(Array(Rxxyy[2:end-1,2:end-1] .== (Axxyy[2:end-1,3:end].-Axxyy[2:end-1,2:end-1]).-(Axxyy[2:end-1,2:end-1].-Axxyy[2:end-1,1:end-2]))) + Rxxyy[2:end-1,2:end-1].=0; @test all(Array(Rxxyy .== 0)) # Test that boundary values remained zero. 
end; end; @reset_parallel_stencil() diff --git a/test/test_FiniteDifferences3D.jl b/test/test_FiniteDifferences3D.jl index 844062f7..5ec92b90 100644 --- a/test/test_FiniteDifferences3D.jl +++ b/test/test_FiniteDifferences3D.jl @@ -66,15 +66,15 @@ eval(:( @parallel d2_xi!(R, Axxyyzz) = (@all(R) = @d2_xi(Axxyyzz); return) @parallel d2_yi!(R, Axxyyzz) = (@all(R) = @d2_yi(Axxyyzz); return) @parallel d2_zi!(R, Axxyyzz) = (@all(R) = @d2_zi(Axxyyzz); return) - R.=0; @parallel d_xa!(R, Ax); @test all(Array(R .≈ Ax[2:end, :, :].-Ax[1:end-1, :, :])) - R.=0; @parallel d_ya!(R, Ay); @test all(Array(R .≈ Ay[ :,2:end, :].-Ay[ :,1:end-1, :])) - R.=0; @parallel d_za!(R, Az); @test all(Array(R .≈ Az[ :, :,2:end].-Az[ :, :,1:end-1])) - R.=0; @parallel d_xi!(R, Axyyzz); @test all(Array(R .≈ Axyyzz[2:end ,2:end-1,2:end-1].-Axyyzz[1:end-1,2:end-1,2:end-1])) - R.=0; @parallel d_yi!(R, Axxyzz); @test all(Array(R .≈ Axxyzz[2:end-1,2:end ,2:end-1].-Axxyzz[2:end-1,1:end-1,2:end-1])) - R.=0; @parallel d_zi!(R, Axxyyz); @test all(Array(R .≈ Axxyyz[2:end-1,2:end-1,2:end ].-Axxyyz[2:end-1,2:end-1,1:end-1])) - R.=0; @parallel d2_xi!(R, Axxyyzz); @test all(Array(R .≈ (Axxyyzz[3:end,2:end-1,2:end-1].-Axxyyzz[2:end-1,2:end-1,2:end-1]).-(Axxyyzz[2:end-1,2:end-1,2:end-1].-Axxyyzz[1:end-2,2:end-1,2:end-1]))) - R.=0; @parallel d2_yi!(R, Axxyyzz); @test all(Array(R .≈ (Axxyyzz[2:end-1,3:end,2:end-1].-Axxyyzz[2:end-1,2:end-1,2:end-1]).-(Axxyyzz[2:end-1,2:end-1,2:end-1].-Axxyyzz[2:end-1,1:end-2,2:end-1]))) - R.=0; @parallel d2_zi!(R, Axxyyzz); @test all(Array(R .≈ (Axxyyzz[2:end-1,2:end-1,3:end].-Axxyyzz[2:end-1,2:end-1,2:end-1]).-(Axxyyzz[2:end-1,2:end-1,2:end-1].-Axxyyzz[2:end-1,2:end-1,1:end-2]))) + R.=0; @parallel d_xa!(R, Ax); @test all(Array(R .== Ax[2:end, :, :].-Ax[1:end-1, :, :])) + R.=0; @parallel d_ya!(R, Ay); @test all(Array(R .== Ay[ :,2:end, :].-Ay[ :,1:end-1, :])) + R.=0; @parallel d_za!(R, Az); @test all(Array(R .== Az[ :, :,2:end].-Az[ :, :,1:end-1])) + R.=0; @parallel d_xi!(R, Axyyzz); @test all(Array(R .== Axyyzz[2:end ,2:end-1,2:end-1].-Axyyzz[1:end-1,2:end-1,2:end-1])) + R.=0; @parallel d_yi!(R, Axxyzz); @test all(Array(R .== Axxyzz[2:end-1,2:end ,2:end-1].-Axxyzz[2:end-1,1:end-1,2:end-1])) + R.=0; @parallel d_zi!(R, Axxyyz); @test all(Array(R .== Axxyyz[2:end-1,2:end-1,2:end ].-Axxyyz[2:end-1,2:end-1,1:end-1])) + R.=0; @parallel d2_xi!(R, Axxyyzz); @test all(Array(R .== (Axxyyzz[3:end,2:end-1,2:end-1].-Axxyyzz[2:end-1,2:end-1,2:end-1]).-(Axxyyzz[2:end-1,2:end-1,2:end-1].-Axxyyzz[1:end-2,2:end-1,2:end-1]))) + R.=0; @parallel d2_yi!(R, Axxyyzz); @test all(Array(R .== (Axxyyzz[2:end-1,3:end,2:end-1].-Axxyyzz[2:end-1,2:end-1,2:end-1]).-(Axxyyzz[2:end-1,2:end-1,2:end-1].-Axxyyzz[2:end-1,1:end-2,2:end-1]))) + R.=0; @parallel d2_zi!(R, Axxyyzz); @test all(Array(R .== (Axxyyzz[2:end-1,2:end-1,3:end].-Axxyyzz[2:end-1,2:end-1,2:end-1]).-(Axxyyzz[2:end-1,2:end-1,2:end-1].-Axxyyzz[2:end-1,2:end-1,1:end-2]))) end; @testset "selection" begin @parallel all!(R, A) = (@all(R) = @all(A); return) @@ -85,14 +85,14 @@ eval(:( @parallel inn_xy!(R, Axxyy) = (@all(R) = @inn_xy(Axxyy); return) @parallel inn_xz!(R, Axxzz) = (@all(R) = @inn_xz(Axxzz); return) @parallel inn_yz!(R, Ayyzz) = (@all(R) = @inn_yz(Ayyzz); return) - R.=0; @parallel all!(R, A); @test all(Array(R .≈ A)) - R.=0; @parallel inn!(R, Axxyyzz); @test all(Array(R .≈ Axxyyzz[2:end-1,2:end-1,2:end-1])) - R.=0; @parallel inn_x!(R, Axx); @test all(Array(R .≈ Axx[2:end-1, :, :])) - R.=0; @parallel inn_y!(R, Ayy); @test all(Array(R .≈ Ayy[ :,2:end-1, :])) - R.=0; @parallel 
inn_z!(R, Azz); @test all(Array(R .≈ Azz[ :, :,2:end-1])) - R.=0; @parallel inn_xy!(R, Axxyy); @test all(Array(R .≈ Axxyy[2:end-1,2:end-1, :])) - R.=0; @parallel inn_xz!(R, Axxzz); @test all(Array(R .≈ Axxzz[2:end-1, :,2:end-1])) - R.=0; @parallel inn_yz!(R, Ayyzz); @test all(Array(R .≈ Ayyzz[ :,2:end-1,2:end-1])) + R.=0; @parallel all!(R, A); @test all(Array(R .== A)) + R.=0; @parallel inn!(R, Axxyyzz); @test all(Array(R .== Axxyyzz[2:end-1,2:end-1,2:end-1])) + R.=0; @parallel inn_x!(R, Axx); @test all(Array(R .== Axx[2:end-1, :, :])) + R.=0; @parallel inn_y!(R, Ayy); @test all(Array(R .== Ayy[ :,2:end-1, :])) + R.=0; @parallel inn_z!(R, Azz); @test all(Array(R .== Azz[ :, :,2:end-1])) + R.=0; @parallel inn_xy!(R, Axxyy); @test all(Array(R .== Axxyy[2:end-1,2:end-1, :])) + R.=0; @parallel inn_xz!(R, Axxzz); @test all(Array(R .== Axxzz[2:end-1, :,2:end-1])) + R.=0; @parallel inn_yz!(R, Ayyzz); @test all(Array(R .== Ayyzz[ :,2:end-1,2:end-1])) end; @testset "averages" begin @parallel av!(R, Axyz) = (@all(R) = @av(Axyz); return) @@ -108,19 +108,19 @@ eval(:( @parallel av_xyi!(R, Axyzz) = (@all(R) = @av_xyi(Axyzz); return) @parallel av_xzi!(R, Axyyz) = (@all(R) = @av_xzi(Axyyz); return) @parallel av_yzi!(R, Axxyz) = (@all(R) = @av_yzi(Axxyz); return) - R.=0; @parallel av!(R, Axyz); @test all(Array(R .≈ (Axyz[1:end-1,1:end-1,1:end-1].+Axyz[2:end,1:end-1,1:end-1].+Axyz[2:end,2:end,1:end-1].+Axyz[2:end,2:end,2:end].+Axyz[1:end-1,2:end,2:end].+Axyz[1:end-1,1:end-1,2:end].+Axyz[2:end,1:end-1,2:end].+Axyz[1:end-1,2:end,1:end-1])./8)) - R.=0; @parallel av_xa!(R, Ax); @test all(Array(R .≈ (Ax[2:end, :, :].+Ax[1:end-1, :, :])./2)) - R.=0; @parallel av_ya!(R, Ay); @test all(Array(R .≈ (Ay[ :,2:end, :].+Ay[ :,1:end-1, :])./2)) - R.=0; @parallel av_za!(R, Az); @test all(Array(R .≈ (Az[ :, :,2:end].+Az[ :, :,1:end-1])./2)) - R.=0; @parallel av_xi!(R, Axyyzz); @test all(Array(R .≈ (Axyyzz[2:end ,2:end-1,2:end-1].+Axyyzz[1:end-1,2:end-1,2:end-1])./2)) - R.=0; @parallel av_yi!(R, Axxyzz); @test all(Array(R .≈ (Axxyzz[2:end-1,2:end ,2:end-1].+Axxyzz[2:end-1,1:end-1,2:end-1])./2)) - R.=0; @parallel av_zi!(R, Axxyyz); @test all(Array(R .≈ (Axxyyz[2:end-1,2:end-1,2:end ].+Axxyyz[2:end-1,2:end-1,1:end-1])./2)) - R.=0; @parallel av_xya!(R, Axy); @test all(Array(R .≈ (Axy[1:end-1,1:end-1,:].+Axy[2:end,1:end-1,:].+Axy[1:end-1,2:end,:].+Axy[2:end,2:end,:])./4)) - R.=0; @parallel av_xza!(R, Axz); @test all(Array(R .≈ (Axz[1:end-1,:,1:end-1].+Axz[2:end,:,1:end-1].+Axz[1:end-1,:,2:end].+Axz[2:end,:,2:end])./4)) - R.=0; @parallel av_yza!(R, Ayz); @test all(Array(R .≈ (Ayz[:,1:end-1,1:end-1].+Ayz[:,2:end,1:end-1].+Ayz[:,1:end-1,2:end].+Ayz[:,2:end,2:end])./4)) - R.=0; @parallel av_xyi!(R, Axyzz); @test all(Array(R .≈ (Axyzz[1:end-1,1:end-1,2:end-1].+Axyzz[2:end,1:end-1,2:end-1].+Axyzz[1:end-1,2:end,2:end-1].+Axyzz[2:end,2:end,2:end-1])./4)) - R.=0; @parallel av_xzi!(R, Axyyz); @test all(Array(R .≈ (Axyyz[1:end-1,2:end-1,1:end-1].+Axyyz[2:end,2:end-1,1:end-1].+Axyyz[1:end-1,2:end-1,2:end].+Axyyz[2:end,2:end-1,2:end])./4)) - R.=0; @parallel av_yzi!(R, Axxyz); @test all(Array(R .≈ (Axxyz[2:end-1,1:end-1,1:end-1].+Axxyz[2:end-1,2:end,1:end-1].+Axxyz[2:end-1,1:end-1,2:end].+Axxyz[2:end-1,2:end,2:end])./4)) + R.=0; @parallel av!(R, Axyz); @test all(Array(R .== (Axyz[1:end-1,1:end-1,1:end-1].+Axyz[2:end,1:end-1,1:end-1].+Axyz[2:end,2:end,1:end-1].+Axyz[2:end,2:end,2:end].+Axyz[1:end-1,2:end,2:end].+Axyz[1:end-1,1:end-1,2:end].+Axyz[2:end,1:end-1,2:end].+Axyz[1:end-1,2:end,1:end-1])./8)) + R.=0; @parallel av_xa!(R, Ax); @test 
all(Array(R .== (Ax[2:end, :, :].+Ax[1:end-1, :, :])./2)) + R.=0; @parallel av_ya!(R, Ay); @test all(Array(R .== (Ay[ :,2:end, :].+Ay[ :,1:end-1, :])./2)) + R.=0; @parallel av_za!(R, Az); @test all(Array(R .== (Az[ :, :,2:end].+Az[ :, :,1:end-1])./2)) + R.=0; @parallel av_xi!(R, Axyyzz); @test all(Array(R .== (Axyyzz[2:end ,2:end-1,2:end-1].+Axyyzz[1:end-1,2:end-1,2:end-1])./2)) + R.=0; @parallel av_yi!(R, Axxyzz); @test all(Array(R .== (Axxyzz[2:end-1,2:end ,2:end-1].+Axxyzz[2:end-1,1:end-1,2:end-1])./2)) + R.=0; @parallel av_zi!(R, Axxyyz); @test all(Array(R .== (Axxyyz[2:end-1,2:end-1,2:end ].+Axxyyz[2:end-1,2:end-1,1:end-1])./2)) + R.=0; @parallel av_xya!(R, Axy); @test all(Array(R .== (Axy[1:end-1,1:end-1,:].+Axy[2:end,1:end-1,:].+Axy[1:end-1,2:end,:].+Axy[2:end,2:end,:])./4)) + R.=0; @parallel av_xza!(R, Axz); @test all(Array(R .== (Axz[1:end-1,:,1:end-1].+Axz[2:end,:,1:end-1].+Axz[1:end-1,:,2:end].+Axz[2:end,:,2:end])./4)) + R.=0; @parallel av_yza!(R, Ayz); @test all(Array(R .== (Ayz[:,1:end-1,1:end-1].+Ayz[:,2:end,1:end-1].+Ayz[:,1:end-1,2:end].+Ayz[:,2:end,2:end])./4)) + R.=0; @parallel av_xyi!(R, Axyzz); @test all(Array(R .== (Axyzz[1:end-1,1:end-1,2:end-1].+Axyzz[2:end,1:end-1,2:end-1].+Axyzz[1:end-1,2:end,2:end-1].+Axyzz[2:end,2:end,2:end-1])./4)) + R.=0; @parallel av_xzi!(R, Axyyz); @test all(Array(R .== (Axyyz[1:end-1,2:end-1,1:end-1].+Axyyz[2:end,2:end-1,1:end-1].+Axyyz[1:end-1,2:end-1,2:end].+Axyyz[2:end,2:end-1,2:end])./4)) + R.=0; @parallel av_yzi!(R, Axxyz); @test all(Array(R .== (Axxyz[2:end-1,1:end-1,1:end-1].+Axxyz[2:end-1,2:end,1:end-1].+Axxyz[2:end-1,1:end-1,2:end].+Axxyz[2:end-1,2:end,2:end])./4)) end; @testset "harmonic averages" begin @parallel harm!(R, Axyz) = (@all(R) = @harm(Axyz); return) @@ -136,44 +136,44 @@ eval(:( @parallel harm_xyi!(R, Axyzz) = (@all(R) = @harm_xyi(Axyzz); return) @parallel harm_xzi!(R, Axyyz) = (@all(R) = @harm_xzi(Axyyz); return) @parallel harm_yzi!(R, Axxyz) = (@all(R) = @harm_yzi(Axxyz); return) - R.=0; @parallel harm!(R, Axyz); @test all(Array(R .≈ 8 ./(1 ./Axyz[1:end-1,1:end-1,1:end-1].+1 ./Axyz[2:end,1:end-1,1:end-1].+1 ./Axyz[2:end,2:end,1:end-1].+1 ./Axyz[2:end,2:end,2:end].+1 ./Axyz[1:end-1,2:end,2:end].+1 ./Axyz[1:end-1,1:end-1,2:end].+1 ./Axyz[2:end,1:end-1,2:end].+1 ./Axyz[1:end-1,2:end,1:end-1]) )) - R.=0; @parallel harm_xa!(R, Ax); @test all(Array(R .≈ 2 ./(1 ./Ax[2:end, :, :].+1 ./Ax[1:end-1, :, :]))) - R.=0; @parallel harm_ya!(R, Ay); @test all(Array(R .≈ 2 ./(1 ./Ay[ :,2:end, :].+1 ./Ay[ :,1:end-1, :]))) - R.=0; @parallel harm_za!(R, Az); @test all(Array(R .≈ 2 ./(1 ./Az[ :, :,2:end].+1 ./Az[ :, :,1:end-1]))) - R.=0; @parallel harm_xi!(R, Axyyzz); @test all(Array(R .≈ 2 ./(1 ./Axyyzz[2:end ,2:end-1,2:end-1].+1 ./Axyyzz[1:end-1,2:end-1,2:end-1]))) - R.=0; @parallel harm_yi!(R, Axxyzz); @test all(Array(R .≈ 2 ./(1 ./Axxyzz[2:end-1,2:end ,2:end-1].+1 ./Axxyzz[2:end-1,1:end-1,2:end-1]))) - R.=0; @parallel harm_zi!(R, Axxyyz); @test all(Array(R .≈ 2 ./(1 ./Axxyyz[2:end-1,2:end-1,2:end ].+1 ./Axxyyz[2:end-1,2:end-1,1:end-1]))) - R.=0; @parallel harm_xya!(R, Axy); @test all(Array(R .≈ 4 ./(1 ./Axy[1:end-1,1:end-1,:].+1 ./Axy[2:end,1:end-1,:].+1 ./Axy[1:end-1,2:end,:].+1 ./Axy[2:end,2:end,:]))) - R.=0; @parallel harm_xza!(R, Axz); @test all(Array(R .≈ 4 ./(1 ./Axz[1:end-1,:,1:end-1].+1 ./Axz[2:end,:,1:end-1].+1 ./Axz[1:end-1,:,2:end].+1 ./Axz[2:end,:,2:end]))) - R.=0; @parallel harm_yza!(R, Ayz); @test all(Array(R .≈ 4 ./(1 ./Ayz[:,1:end-1,1:end-1].+1 ./Ayz[:,2:end,1:end-1].+1 ./Ayz[:,1:end-1,2:end].+1 ./Ayz[:,2:end,2:end]))) - 
R.=0; @parallel harm_xyi!(R, Axyzz); @test all(Array(R .≈ 4 ./(1 ./Axyzz[1:end-1,1:end-1,2:end-1].+1 ./Axyzz[2:end,1:end-1,2:end-1].+1 ./Axyzz[1:end-1,2:end,2:end-1].+1 ./Axyzz[2:end,2:end,2:end-1]))) - R.=0; @parallel harm_xzi!(R, Axyyz); @test all(Array(R .≈ 4 ./(1 ./Axyyz[1:end-1,2:end-1,1:end-1].+1 ./Axyyz[2:end,2:end-1,1:end-1].+1 ./Axyyz[1:end-1,2:end-1,2:end].+1 ./Axyyz[2:end,2:end-1,2:end]))) - R.=0; @parallel harm_yzi!(R, Axxyz); @test all(Array(R .≈ 4 ./(1 ./Axxyz[2:end-1,1:end-1,1:end-1].+1 ./Axxyz[2:end-1,2:end,1:end-1].+1 ./Axxyz[2:end-1,1:end-1,2:end].+1 ./Axxyz[2:end-1,2:end,2:end]))) + R.=0; @parallel harm!(R, Axyz); @test all(Array(R .== 8 ./(1 ./Axyz[1:end-1,1:end-1,1:end-1].+1 ./Axyz[2:end,1:end-1,1:end-1].+1 ./Axyz[2:end,2:end,1:end-1].+1 ./Axyz[2:end,2:end,2:end].+1 ./Axyz[1:end-1,2:end,2:end].+1 ./Axyz[1:end-1,1:end-1,2:end].+1 ./Axyz[2:end,1:end-1,2:end].+1 ./Axyz[1:end-1,2:end,1:end-1]) )) + R.=0; @parallel harm_xa!(R, Ax); @test all(Array(R .== 2 ./(1 ./Ax[2:end, :, :].+1 ./Ax[1:end-1, :, :]))) + R.=0; @parallel harm_ya!(R, Ay); @test all(Array(R .== 2 ./(1 ./Ay[ :,2:end, :].+1 ./Ay[ :,1:end-1, :]))) + R.=0; @parallel harm_za!(R, Az); @test all(Array(R .== 2 ./(1 ./Az[ :, :,2:end].+1 ./Az[ :, :,1:end-1]))) + R.=0; @parallel harm_xi!(R, Axyyzz); @test all(Array(R .== 2 ./(1 ./Axyyzz[2:end ,2:end-1,2:end-1].+1 ./Axyyzz[1:end-1,2:end-1,2:end-1]))) + R.=0; @parallel harm_yi!(R, Axxyzz); @test all(Array(R .== 2 ./(1 ./Axxyzz[2:end-1,2:end ,2:end-1].+1 ./Axxyzz[2:end-1,1:end-1,2:end-1]))) + R.=0; @parallel harm_zi!(R, Axxyyz); @test all(Array(R .== 2 ./(1 ./Axxyyz[2:end-1,2:end-1,2:end ].+1 ./Axxyyz[2:end-1,2:end-1,1:end-1]))) + R.=0; @parallel harm_xya!(R, Axy); @test all(Array(R .== 4 ./(1 ./Axy[1:end-1,1:end-1,:].+1 ./Axy[2:end,1:end-1,:].+1 ./Axy[1:end-1,2:end,:].+1 ./Axy[2:end,2:end,:]))) + R.=0; @parallel harm_xza!(R, Axz); @test all(Array(R .== 4 ./(1 ./Axz[1:end-1,:,1:end-1].+1 ./Axz[2:end,:,1:end-1].+1 ./Axz[1:end-1,:,2:end].+1 ./Axz[2:end,:,2:end]))) + R.=0; @parallel harm_yza!(R, Ayz); @test all(Array(R .== 4 ./(1 ./Ayz[:,1:end-1,1:end-1].+1 ./Ayz[:,2:end,1:end-1].+1 ./Ayz[:,1:end-1,2:end].+1 ./Ayz[:,2:end,2:end]))) + R.=0; @parallel harm_xyi!(R, Axyzz); @test all(Array(R .== 4 ./(1 ./Axyzz[1:end-1,1:end-1,2:end-1].+1 ./Axyzz[2:end,1:end-1,2:end-1].+1 ./Axyzz[1:end-1,2:end,2:end-1].+1 ./Axyzz[2:end,2:end,2:end-1]))) + R.=0; @parallel harm_xzi!(R, Axyyz); @test all(Array(R .== 4 ./(1 ./Axyyz[1:end-1,2:end-1,1:end-1].+1 ./Axyyz[2:end,2:end-1,1:end-1].+1 ./Axyyz[1:end-1,2:end-1,2:end].+1 ./Axyyz[2:end,2:end-1,2:end]))) + R.=0; @parallel harm_yzi!(R, Axxyz); @test all(Array(R .== 4 ./(1 ./Axxyz[2:end-1,1:end-1,1:end-1].+1 ./Axxyz[2:end-1,2:end,1:end-1].+1 ./Axxyz[2:end-1,1:end-1,2:end].+1 ./Axxyz[2:end-1,2:end,2:end]))) end; @testset "others" begin @parallel maxloc!(R, Axxyyzz) = (@all(R) = @maxloc(Axxyyzz); return) - R.=0; @parallel maxloc!(R, Axxyyzz); @test all(Array(R .≈ max.(max.(max.(max.(max.(max.(Axxyyzz[1:end-2,2:end-1,2:end-1],Axxyyzz[3:end,2:end-1,2:end-1]),Axxyyzz[2:end-1,2:end-1,2:end-1]),Axxyyzz[2:end-1,1:end-2,2:end-1]),Axxyyzz[2:end-1,3:end,2:end-1]),Axxyyzz[2:end-1,2:end-1,1:end-2]),Axxyyzz[2:end-1,2:end-1,3:end]))) + R.=0; @parallel maxloc!(R, Axxyyzz); @test all(Array(R .== max.(max.(max.(max.(max.(max.(Axxyyzz[1:end-2,2:end-1,2:end-1],Axxyyzz[3:end,2:end-1,2:end-1]),Axxyyzz[2:end-1,2:end-1,2:end-1]),Axxyyzz[2:end-1,1:end-2,2:end-1]),Axxyyzz[2:end-1,3:end,2:end-1]),Axxyyzz[2:end-1,2:end-1,1:end-2]),Axxyyzz[2:end-1,2:end-1,3:end]))) end; end; 
@testset "2. apply masks" begin @testset "selection" begin @parallel inn_all!(Rxxyyzz, A) = (@inn(Rxxyyzz) = @all(A); return) @parallel inn_inn!(Rxxyyzz, Axxyyzz) = (@inn(Rxxyyzz) = @inn(Axxyyzz); return) - Rxxyyzz.=0; @parallel inn_all!(Rxxyyzz, A); @test all(Array(Rxxyyzz[2:end-1,2:end-1,2:end-1] .≈ A)) - Rxxyyzz[2:end-1,2:end-1,2:end-1].=0; @test all(Array(Rxxyyzz .≈ 0)) # Test that boundary values remained zero. - Rxxyyzz.=0; @parallel inn_inn!(Rxxyyzz, Axxyyzz); @test all(Array(Rxxyyzz[2:end-1,2:end-1,2:end-1] .≈ Axxyyzz[2:end-1,2:end-1,2:end-1])) - Rxxyyzz[2:end-1,2:end-1,2:end-1].=0; @test all(Array(Rxxyyzz .≈ 0)) # Test that boundary values remained zero. + Rxxyyzz.=0; @parallel inn_all!(Rxxyyzz, A); @test all(Array(Rxxyyzz[2:end-1,2:end-1,2:end-1] .== A)) + Rxxyyzz[2:end-1,2:end-1,2:end-1].=0; @test all(Array(Rxxyyzz .== 0)) # Test that boundary values remained zero. + Rxxyyzz.=0; @parallel inn_inn!(Rxxyyzz, Axxyyzz); @test all(Array(Rxxyyzz[2:end-1,2:end-1,2:end-1] .== Axxyyzz[2:end-1,2:end-1,2:end-1])) + Rxxyyzz[2:end-1,2:end-1,2:end-1].=0; @test all(Array(Rxxyyzz .== 0)) # Test that boundary values remained zero. end; @testset "differences" begin @parallel inn_d_xa!(Rxxyyzz, Ax) = (@inn(Rxxyyzz) = @d_xa(Ax); return) @parallel inn_d_yi!(Rxxyyzz, Axxyzz) = (@inn(Rxxyyzz) = @d_yi(Axxyzz); return) @parallel inn_d2_yi!(Rxxyyzz, Axxyyzz) = (@inn(Rxxyyzz) = @d2_yi(Axxyyzz); return) - Rxxyyzz.=0; @parallel inn_d_xa!(Rxxyyzz, Ax); @test all(Array(Rxxyyzz[2:end-1,2:end-1,2:end-1] .≈ Ax[2:end, :, :].-Ax[1:end-1, :, :])) - Rxxyyzz[2:end-1,2:end-1,2:end-1].=0; @test all(Array(Rxxyyzz .≈ 0)) # Test that boundary values remained zero. - Rxxyyzz.=0; @parallel inn_d_yi!(Rxxyyzz, Axxyzz); @test all(Array(Rxxyyzz[2:end-1,2:end-1,2:end-1] .≈ Axxyzz[2:end-1,2:end ,2:end-1].-Axxyzz[2:end-1,1:end-1,2:end-1])) - Rxxyyzz[2:end-1,2:end-1,2:end-1].=0; @test all(Array(Rxxyyzz .≈ 0)) # Test that boundary values remained zero. - Rxxyyzz.=0; @parallel inn_d2_yi!(Rxxyyzz, Axxyyzz); @test all(Array(Rxxyyzz[2:end-1,2:end-1,2:end-1] .≈ (Axxyyzz[2:end-1,3:end,2:end-1].-Axxyyzz[2:end-1,2:end-1,2:end-1]).-(Axxyyzz[2:end-1,2:end-1,2:end-1].-Axxyyzz[2:end-1,1:end-2,2:end-1]))) - Rxxyyzz[2:end-1,2:end-1,2:end-1].=0; @test all(Array(Rxxyyzz .≈ 0)) # Test that boundary values remained zero. + Rxxyyzz.=0; @parallel inn_d_xa!(Rxxyyzz, Ax); @test all(Array(Rxxyyzz[2:end-1,2:end-1,2:end-1] .== Ax[2:end, :, :].-Ax[1:end-1, :, :])) + Rxxyyzz[2:end-1,2:end-1,2:end-1].=0; @test all(Array(Rxxyyzz .== 0)) # Test that boundary values remained zero. + Rxxyyzz.=0; @parallel inn_d_yi!(Rxxyyzz, Axxyzz); @test all(Array(Rxxyyzz[2:end-1,2:end-1,2:end-1] .== Axxyzz[2:end-1,2:end ,2:end-1].-Axxyzz[2:end-1,1:end-1,2:end-1])) + Rxxyyzz[2:end-1,2:end-1,2:end-1].=0; @test all(Array(Rxxyyzz .== 0)) # Test that boundary values remained zero. + Rxxyyzz.=0; @parallel inn_d2_yi!(Rxxyyzz, Axxyyzz); @test all(Array(Rxxyyzz[2:end-1,2:end-1,2:end-1] .== (Axxyyzz[2:end-1,3:end,2:end-1].-Axxyyzz[2:end-1,2:end-1,2:end-1]).-(Axxyyzz[2:end-1,2:end-1,2:end-1].-Axxyyzz[2:end-1,1:end-2,2:end-1]))) + Rxxyyzz[2:end-1,2:end-1,2:end-1].=0; @test all(Array(Rxxyyzz .== 0)) # Test that boundary values remained zero. 
end; end; @reset_parallel_stencil() diff --git a/test/test_parallel.jl b/test/test_parallel.jl index dc1009d2..815aea37 100644 --- a/test/test_parallel.jl +++ b/test/test_parallel.jl @@ -220,7 +220,7 @@ eval(:( return end @parallel write_indices!(A); - @test all(Array(A) .≈ [ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]) + @test all(Array(A) .== [ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]) end @testset "@parallel (3D; on-the-fly)" begin nx, ny, nz = 32, 8, 8 @@ -244,7 +244,7 @@ eval(:( + ((T[2:end-1,3:end ,2:end-1] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,1:end-2,2:end-1])).*_dy^2 + ((T[2:end-1,2:end-1,3:end ] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,2:end-1,1:end-2])).*_dz^2) ); - @test all(Array(T2) .≈ Array(T2_ref)) + @test all(Array(T2) .== Array(T2_ref)) end @static if $package in [$PKG_CUDA, $PKG_AMDGPU] # TODO add support for Metal @testset "@parallel memopt (nx, ny, nz = x .* threads)" begin # NOTE: the following does not work for some reason: (nx, ny, nz = ($nx, $ny, $nz))" for (nx, ny, nz) in ((32, 8, 9), (32, 8, 8), (31, 7, 9), (33, 9, 9), (33, 7, 8)) @@ -261,7 +261,7 @@ eval(:( return end @parallel memopt=true copy_memopt!(A2, A); - @test all(Array(A2) .≈ Array(A)) + @test all(Array(A2) .== Array(A)) end @testset "@parallel (3D, memopt, stencilranges=0:0)" begin A = @zeros(nx, ny, nz); @@ -272,7 +272,7 @@ eval(:( return end @parallel memopt=true copy_memopt!(A2, A); - @test all(Array(A2) .≈ Array(A)) + @test all(Array(A2) .== Array(A)) end @testset "@parallel_indices (3D, memopt, stencilranges=(0:0, 0:0, -1:1); z-stencil)" begin A = @zeros(nx, ny, nz); @@ -287,7 +287,7 @@ eval(:( end @parallel memopt=true d2_memopt!(A2, A); A2_ref[:,:,2:end-1] .= A[:,:,3:end] .- 2*A[:,:,2:end-1] .+ A[:,:,1:end-2]; - @test all(Array(A2) .≈ Array(A2_ref)) + @test all(Array(A2) .== Array(A2_ref)) end @testset "@parallel_indices (3D, memopt, stencilranges=(0:0, -1:1, 0:0); y-stencil)" begin A = @zeros(nx, ny, nz); @@ -302,7 +302,7 @@ eval(:( end @parallel memopt=true d2_memopt!(A2, A); A2_ref[:,2:end-1,:] .= A[:,3:end,:] .- 2*A[:,2:end-1,:] .+ A[:,1:end-2,:]; - @test all(Array(A2) .≈ Array(A2_ref)) + @test all(Array(A2) .== Array(A2_ref)) end @testset "@parallel (3D, memopt, stencilranges=(1:1, 1:1, 0:2); z-stencil)" begin A = @zeros(nx, ny, nz); @@ -315,7 +315,7 @@ eval(:( end @parallel memopt=true d2_memopt!(A2, A); A2_ref[2:end-1,2:end-1,2:end-1] .= A[2:end-1,2:end-1,3:end] .- 2*A[2:end-1,2:end-1,2:end-1] .+ A[2:end-1,2:end-1,1:end-2]; - @test all(Array(A2) .≈ Array(A2_ref)) + @test all(Array(A2) .== Array(A2_ref)) end @testset "@parallel (3D, memopt, stencilranges=(1:1, 0:2, 1:1); y-stencil)" begin A = @zeros(nx, ny, nz); @@ -328,7 +328,7 @@ eval(:( end @parallel memopt=true d2_memopt!(A2, A); A2_ref[2:end-1,2:end-1,2:end-1] .= A[2:end-1,3:end,2:end-1] .- 2*A[2:end-1,2:end-1,2:end-1] .+ A[2:end-1,1:end-2,2:end-1]; - @test all(Array(A2) .≈ Array(A2_ref)) + @test all(Array(A2) .== Array(A2_ref)) end @testset "@parallel_indices (3D, memopt, stencilranges=-1:1)" begin lam=dt=_dx=_dy=_dz = 1 @@ -353,7 +353,7 @@ eval(:( - ((.-lam.*(T[2:end-1,3:end ,2:end-1] .- T[2:end-1,2:end-1,2:end-1]).*_dy) .- (.-lam.*(T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,1:end-2,2:end-1]).*_dy)).*_dy - ((.-lam.*(T[2:end-1,2:end-1,3:end ] .- T[2:end-1,2:end-1,2:end-1]).*_dz) .- (.-lam.*(T[2:end-1,2:end-1,2:end-1] .- 
T[2:end-1,2:end-1,1:end-2]).*_dz)).*_dz) ); - @test all(Array(T2) .≈ Array(T2_ref)) + @test all(Array(T2) .== Array(T2_ref)) end @testset "@parallel (3D, memopt, stencilranges=0:2)" begin lam=dt=_dx=_dy=_dz = 1 @@ -372,7 +372,7 @@ eval(:( + ((T[2:end-1,3:end ,2:end-1] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,1:end-2,2:end-1])).*_dy^2 + ((T[2:end-1,2:end-1,3:end ] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,2:end-1,1:end-2])).*_dz^2) ); - @test all(Array(T2) .≈ Array(T2_ref)) + @test all(Array(T2) .== Array(T2_ref)) end @testset "@parallel_indices (3D, memopt, stencilranges=(-4:-1, 2:2, -2:3); x-z-stencil, y-shift)" begin A = @zeros(nx, ny, nz); @@ -387,7 +387,7 @@ eval(:( end @parallel memopt=true higher_order_memopt!(A2, A); A2_ref[5:end-1,3:end,3:end-3] .= A[5:end-1,3:end,6:end] .- 2*A[3:end-3,3:end,3:end-3] .+ A[2:end-4,3:end,1:end-5]; - @test all(Array(A2) .≈ Array(A2_ref)) + @test all(Array(A2) .== Array(A2_ref)) end @testset "@parallel (3D, memopt, stencilranges=0:2; on-the-fly)" begin lam=dt=_dx=_dy=_dz = 1 @@ -410,7 +410,7 @@ eval(:( + ((T[2:end-1,3:end ,2:end-1] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,1:end-2,2:end-1])).*_dy^2 + ((T[2:end-1,2:end-1,3:end ] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,2:end-1,1:end-2])).*_dz^2) ); - @test all(Array(T2) .≈ Array(T2_ref)) + @test all(Array(T2) .== Array(T2_ref)) end @testset "@parallel (3D, memopt, stencilranges=0:0; 2 arrays)" begin A = @zeros(nx, ny, nz); @@ -423,7 +423,7 @@ eval(:( return end @parallel memopt=true copy_memopt!(A2, A, B); - @test all(Array(A2) .≈ Array(A) .+ Array(B)) + @test all(Array(A2) .== Array(A) .+ Array(B)) end @testset "@parallel_indices (3D, memopt; 2 arrays, z-stencil)" begin A = @zeros(nx, ny, nz); @@ -440,7 +440,7 @@ eval(:( end @parallel memopt=true d2_memopt!(A2, A, B); A2_ref[:,:,2:end-1] .= A[:,:,3:end] .- 2*A[:,:,2:end-1] .+ A[:,:,1:end-2] .+ B[:,:,2:end] .- B[:,:,1:end-1]; - @test all(Array(A2) .≈ Array(A2_ref)) + @test all(Array(A2) .== Array(A2_ref)) end @testset "@parallel_indices (3D, memopt; 2 arrays, y-stencil)" begin A = @zeros(nx, ny, nz); @@ -457,7 +457,7 @@ eval(:( end @parallel memopt=true d2_memopt!(A2, A, B); A2_ref[:,2:end-1,:] .= (((A[:,3:end,:] .- 2*A[:,2:end-1,:]) .+ A[:,1:end-2,:] .+ B[:,3:end,:]) .- 2*B[:,2:end-1,:]) .+ B[:,1:end-2,:]; - @test all(Array(A2) .≈ Array(A2_ref)) + @test all(Array(A2) .== Array(A2_ref)) end @testset "@parallel_indices (3D, memopt; 2 arrays, x-stencil)" begin A = @zeros(nx, ny, nz); @@ -474,7 +474,7 @@ eval(:( end @parallel memopt=true d2_memopt!(A2, A, B); A2_ref[2:end-1,:,:] .= A[3:end,:,:] .- 2*A[2:end-1,:,:] .+ A[1:end-2,:,:] .+ B[2:end,:,:] .- B[1:end-1,:,:]; - @test all(Array(A2) .≈ Array(A2_ref)) + @test all(Array(A2) .== Array(A2_ref)) end @testset "@parallel (3D, memopt; 2 arrays, x-y-z- + z-stencil)" begin lam=dt=_dx=_dy=_dz = 1 @@ -494,7 +494,7 @@ eval(:( + ((T[2:end-1,3:end ,2:end-1] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,1:end-2,2:end-1])).*_dy^2 + ((T[2:end-1,2:end-1,3:end ] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,2:end-1,1:end-2])).*_dz^2) ); - @test all(Array(T2) .≈ Array(T2_ref)) + @test all(Array(T2) .== Array(T2_ref)) end @testset "@parallel (3D, memopt; 2 arrays, x-y-z- + x-stencil)" begin lam=dt=_dx=_dy=_dz = 1 @@ -514,7 +514,7 @@ eval(:( + ((T[2:end-1,3:end ,2:end-1] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- 
T[2:end-1,1:end-2,2:end-1])).*_dy^2 + ((T[2:end-1,2:end-1,3:end ] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,2:end-1,1:end-2])).*_dz^2) ); - @test all(Array(T2) .≈ Array(T2_ref)) + @test all(Array(T2) .== Array(T2_ref)) end @testset "@parallel (3D, memopt; 3 arrays, x-y-z- + y- + x-stencil)" begin lam=dt=_dx=_dy=_dz = 1 @@ -536,7 +536,7 @@ eval(:( + ((T[2:end-1,3:end ,2:end-1] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,1:end-2,2:end-1])).*_dy^2 + ((T[2:end-1,2:end-1,3:end ] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,2:end-1,1:end-2])).*_dz^2) ) + ((B[2:end-1,3:end ,2:end-1] .- B[2:end-1,2:end-1,2:end-1]) .- (B[2:end-1,2:end-1,2:end-1] .- B[2:end-1,1:end-2,2:end-1])); - @test all(Array(T2) .≈ Array(T2_ref)) + @test all(Array(T2) .== Array(T2_ref)) end @testset "@parallel_indices (3D, memopt, stencilranges=(-4:-1, 2:2, -2:3); 3 arrays, x-z-stencil, y-shift)" begin A = @zeros(nx, ny, nz); @@ -567,9 +567,9 @@ eval(:( A2_ref[5:end-1,3:end,3:end-3] .= A[5:end-1,3:end,6:end] .- 2*A[3:end-3,3:end,3:end-3] .+ A[2:end-4,3:end,1:end-5]; B2_ref[5:end-1,3:end,3:end-3] .= B[5:end-1,3:end,6:end] .- 2*B[3:end-3,3:end,3:end-3] .+ B[2:end-4,3:end,1:end-5]; C2_ref[5:end-1,3:end,3:end-3] .= C[5:end-1,3:end,6:end] .- 2*C[3:end-3,3:end,3:end-3] .+ C[2:end-4,3:end,1:end-5]; - @test all(Array(A2) .≈ Array(A2_ref)) - @test all(Array(B2) .≈ Array(B2_ref)) - @test all(Array(C2) .≈ Array(C2_ref)) + @test all(Array(A2) .== Array(A2_ref)) + @test all(Array(B2) .== Array(B2_ref)) + @test all(Array(C2) .== Array(C2_ref)) end @testset "@parallel_indices (3D, memopt, stencilranges=(A=(-4:-1, 2:2, -2:3), B=(-4:-1, 2:2, 1:2), C=(-4:-1, 2:2, -1:0)); 3 arrays, x-z-stencil, y-shift)" begin A = @zeros(nx, ny, nz); @@ -600,9 +600,9 @@ eval(:( A2_ref[5:end-1,3:end,3:end-3] .= A[5:end-1,3:end,6:end] .- 2*A[3:end-3,3:end,3:end-3] .+ A[2:end-4,3:end,1:end-5]; B2_ref[5:end-1,3:end,2:end-1] .= B[5:end-1,3:end,3:end] .- 2*B[3:end-3,3:end,2:end-1] .+ B[2:end-4,3:end,2:end-1]; C2_ref[5:end-1,3:end,1:end-1] .= C[5:end-1,3:end,2:end] .- 2*C[3:end-3,3:end,1:end-1] .+ C[2:end-4,3:end,1:end-1]; - @test all(Array(A2) .≈ Array(A2_ref)) - @test all(Array(B2) .≈ Array(B2_ref)) - @test all(Array(C2) .≈ Array(C2_ref)) + @test all(Array(A2) .== Array(A2_ref)) + @test all(Array(B2) .== Array(B2_ref)) + @test all(Array(C2) .== Array(C2_ref)) end @testset "@parallel_indices (3D, memopt, stencilranges=(A=(-4:-1, 2:2, -2:3), B=(-4:1, 2:2, 1:2), C=(-1:-1, 2:2, -1:0)); 3 arrays, x-z-stencil, y-shift)" begin A = @zeros(nx, ny, nz); @@ -633,9 +633,9 @@ eval(:( A2_ref[5:end-1,3:end,3:end-3] .= A[5:end-1,3:end,6:end] .- 2*A[3:end-3,3:end,3:end-3] .+ A[2:end-4,3:end,1:end-5]; B2_ref[7:end-1,3:end,2:end-1] .= B[7:end-1,3:end,3:end] .- 2*B[3:end-5,3:end,2:end-1] .+ B[2:end-6,3:end,2:end-1]; C2_ref[2:end-1,3:end,1:end-1] .= C[2:end-1,3:end,2:end] .- 2*C[2:end-1,3:end,1:end-1] .+ C[2:end-1,3:end,1:end-1]; - @test all(Array(A2) .≈ Array(A2_ref)) - @test all(Array(B2) .≈ Array(B2_ref)) - @test all(Array(C2) .≈ Array(C2_ref)) + @test all(Array(A2) .== Array(A2_ref)) + @test all(Array(B2) .== Array(B2_ref)) + @test all(Array(C2) .== Array(C2_ref)) end @testset "@parallel_indices (3D, memopt, optvars=(A, C), loopdim=3, loopsize=3, optranges=(A=(-4:-1, 2:2, -2:3), B=(-4:1, 2:2, 1:2), C=(-1:-1, 2:2, -1:0)); stencilranges=(A=(-4:-1, 2:2, -2:3), B=(-4:1, 2:2, 1:2), C=(-1:-1, 2:2, -1:0)), 3 arrays, x-z-stencil, y-shift)" begin A = @zeros(nx, ny, nz); @@ -690,9 +690,9 @@ eval(:( 
A2_ref[5:end-1,3:end,3:end-3] .= A[5:end-1,3:end,6:end] .- 2*A[3:end-3,3:end,3:end-3] .+ A[2:end-4,3:end,1:end-5]; B2_ref[7:end-1,3:end,2:end-1] .= B[7:end-1,3:end,3:end] .- 2*B[3:end-5,3:end,2:end-1] .+ B[2:end-6,3:end,2:end-1]; C2_ref[2:end-1,3:end,1:end-1] .= C[2:end-1,3:end,2:end] .- 2*C[2:end-1,3:end,1:end-1] .+ C[2:end-1,3:end,1:end-1]; - @test all(Array(A2) .≈ Array(A2_ref)) - @test all(Array(B2) .≈ Array(B2_ref)) - @test all(Array(C2) .≈ Array(C2_ref)) + @test all(Array(A2) .== Array(A2_ref)) + @test all(Array(B2) .== Array(B2_ref)) + @test all(Array(C2) .== Array(C2_ref)) end @testset "@parallel_indices (3D, memopt, optvars=(A, C), loopdim=3, loopsize=3, optranges=(A=(-4:-1, 2:2, -2:3), B=(-4:1, 2:2, 1:2), C=(-1:-1, 2:2, -1:0)); stencilranges=(A=(-4:-1, 2:2, -2:3), B=(-4:1, 2:2, 1:2), C=(-1:-1, 2:2, -1:0)), 3 arrays, x-z-stencil, y-shift)" begin A = @zeros(nx, ny, nz); @@ -747,9 +747,9 @@ eval(:( A2_ref[5:end-1,3:end,3:end-3] .= A[5:end-1,3:end,6:end] .- 2*A[3:end-3,3:end,3:end-3] .+ A[2:end-4,3:end,1:end-5]; B2_ref[7:end-1,3:end,2:end-1] .= B[7:end-1,3:end,3:end] .- 2*B[3:end-5,3:end,2:end-1] .+ B[2:end-6,3:end,2:end-1]; C2_ref[2:end-1,3:end,1:end-1] .= C[2:end-1,3:end,2:end] .- 2*C[2:end-1,3:end,1:end-1] .+ C[2:end-1,3:end,1:end-1]; - @test all(Array(A2) .≈ Array(A2_ref)) - @test all(Array(B2) .≈ Array(B2_ref)) - @test all(Array(C2) .≈ Array(C2_ref)) + @test all(Array(A2) .== Array(A2_ref)) + @test all(Array(B2) .== Array(B2_ref)) + @test all(Array(C2) .== Array(C2_ref)) end @testset "@parallel_indices (3D, memopt, optvars=(A, B), loopdim=3, loopsize=3, optranges=(A=(-1:-1, 2:2, -2:3), B=(-4:1, 2:2, 1:1)); stencilranges=(A=(-4:-1, 2:2, -2:3), B=(-4:1, 2:2, 1:2), C=(-1:-1, 2:2, -1:0)), 3 arrays, x-z-stencil, y-shift)" begin A = @zeros(nx, ny, nz); @@ -795,9 +795,9 @@ eval(:( A2_ref[5:end-1,3:end,3:end-3] .= A[5:end-1,3:end,6:end] .- 2*A[3:end-3,3:end,3:end-3] .+ A[2:end-4,3:end,1:end-5]; B2_ref[7:end-1,3:end,2:end-1] .= B[7:end-1,3:end,3:end] .- 2*B[3:end-5,3:end,2:end-1] .+ B[2:end-6,3:end,2:end-1]; C2_ref[2:end-1,3:end,1:end-1] .= C[2:end-1,3:end,2:end] .- 2*C[2:end-1,3:end,1:end-1] .+ C[2:end-1,3:end,1:end-1]; - @test all(Array(A2) .≈ Array(A2_ref)) - @test all(Array(B2) .≈ Array(B2_ref)) - @test all(Array(C2) .≈ Array(C2_ref)) + @test all(Array(A2) .== Array(A2_ref)) + @test all(Array(B2) .== Array(B2_ref)) + @test all(Array(C2) .== Array(C2_ref)) end end @testset "@parallel memopt (nx, ny, nz != x .* threads)" begin @@ -814,7 +814,7 @@ eval(:( end ranges = (1:64,1:64,1:8) # TODO: must be a multiple of the number of threads @parallel ranges memopt=true copy_memopt!(A2, A); - @test all(Array(A2) .≈ Array(A)) + @test all(Array(A2) .== Array(A)) end @testset "@parallel (3D, memopt, stencilranges=0:0)" begin A = @zeros(nx, ny, nz); @@ -825,7 +825,7 @@ eval(:( return end @parallel memopt=true copy_memopt!(A2, A); - @test all(Array(A2) .≈ Array(A)) + @test all(Array(A2) .== Array(A)) end @testset "@parallel (3D, memopt, stencilranges=0:2)" begin lam=dt=_dx=_dy=_dz = 1 @@ -844,7 +844,7 @@ eval(:( + ((T[2:end-1,3:end ,2:end-1] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,1:end-2,2:end-1])).*_dy^2 + ((T[2:end-1,2:end-1,3:end ] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,2:end-1,1:end-2])).*_dz^2) ); - @test all(Array(T2) .≈ Array(T2_ref)) + @test all(Array(T2) .== Array(T2_ref)) end @testset "@parallel (3D, memopt; 3 arrays, x-y-z- + y- + x-stencil)" begin lam=dt=_dx=_dy=_dz = 1 @@ -866,7 +866,7 @@ eval(:( + 
((T[2:end-1,3:end ,2:end-1] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,1:end-2,2:end-1])).*_dy^2 + ((T[2:end-1,2:end-1,3:end ] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,2:end-1,1:end-2])).*_dz^2) ) + ((B[2:end-1,3:end ,2:end-1] .- B[2:end-1,2:end-1,2:end-1]) .- (B[2:end-1,2:end-1,2:end-1] .- B[2:end-1,1:end-2,2:end-1])); - @test all(Array(T2) .≈ Array(T2_ref)) + @test all(Array(T2) .== Array(T2_ref)) end end end @@ -905,7 +905,7 @@ eval(:( - ((.-lam.*(T[3:end ,2:end-1,1] .- T[2:end-1,2:end-1,1]).*_dx) .- (.-lam.*(T[2:end-1,2:end-1,1] .- T[1:end-2,2:end-1,1]).*_dx)).*_dx - ((.-lam.*(T[2:end-1,3:end ,1] .- T[2:end-1,2:end-1,1]).*_dy) .- (.-lam.*(T[2:end-1,2:end-1,1] .- T[2:end-1,1:end-2,1]).*_dy)).*_dy) ); - @test all(Array(T2) .≈ Array(T2_ref)) + @test all(Array(T2) .== Array(T2_ref)) end; end; @reset_parallel_stencil() @@ -934,7 +934,7 @@ eval(:( return end @parallel write_indices!(A, one); - @test all(Array(A) .≈ [(ix-1) for ix=1:size(A,1)]) + @test all(Array(A) .== [(ix-1) for ix=1:size(A,1)]) @reset_parallel_stencil() end; @testset "@parallel_indices (I...) (2D)" begin @@ -948,7 +948,7 @@ eval(:( return end @parallel write_indices!(A, one); - @test all(Array(A) .≈ [(ix-1) + (iy-1)*size(A,1) for ix=1:size(A,1), iy=1:size(A,2)]) + @test all(Array(A) .== [(ix-1) + (iy-1)*size(A,1) for ix=1:size(A,1), iy=1:size(A,2)]) @reset_parallel_stencil() end; @testset "@parallel_indices (I...) (3D)" begin @@ -962,7 +962,7 @@ eval(:( return end @parallel write_indices!(A, one); - @test all(Array(A) .≈ [(ix-1) + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]) + @test all(Array(A) .== [(ix-1) + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]) @reset_parallel_stencil() end; end; From ab47d5fef53f2caf7f2dc4b5511de81c8cce0136 Mon Sep 17 00:00:00 2001 From: GiackAloZ Date: Mon, 28 Oct 2024 17:04:24 +0100 Subject: [PATCH 34/46] Rollback average checks from division to multiplication with precision conversion --- test/test_FiniteDifferences1D.jl | 2 +- test/test_FiniteDifferences2D.jl | 10 +++++----- test/test_FiniteDifferences3D.jl | 26 +++++++++++++------------- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/test/test_FiniteDifferences1D.jl b/test/test_FiniteDifferences1D.jl index 63681bc0..1cb35cee 100644 --- a/test/test_FiniteDifferences1D.jl +++ b/test/test_FiniteDifferences1D.jl @@ -52,7 +52,7 @@ eval(:( end; @testset "averages" begin @parallel av!(R, Ax) = (@all(R) = @av(Ax); return) - R.=0; @parallel av!(R, Ax); @test all(Array(R .== (Ax[1:end-1].+Ax[2:end])./2)) + R.=0; @parallel av!(R, Ax); @test all(Array(R .== (Ax[1:end-1].+Ax[2:end]).*$precision(0.5))) end; @testset "harmonic averages" begin @parallel harm!(R, Ax) = (@all(R) = @harm(Ax); return) diff --git a/test/test_FiniteDifferences2D.jl b/test/test_FiniteDifferences2D.jl index 96f73a13..acc7cac4 100644 --- a/test/test_FiniteDifferences2D.jl +++ b/test/test_FiniteDifferences2D.jl @@ -78,11 +78,11 @@ eval(:( @parallel av_ya!(R, Ay) = (@all(R) = @av_ya(Ay); return) @parallel av_xi!(R, Axyy) = (@all(R) = @av_xi(Axyy); return) @parallel av_yi!(R, Axxy) = (@all(R) = @av_yi(Axxy); return) - R.=0; @parallel av!(R, Axy); @test all(Array(R .== (Axy[1:end-1,1:end-1].+Axy[2:end,1:end-1].+Axy[1:end-1,2:end].+Axy[2:end,2:end])./4)) - R.=0; @parallel av_xa!(R, Ax); @test all(Array(R .== (Ax[2:end, :].+Ax[1:end-1, :])./2)) - R.=0; @parallel av_ya!(R, Ay); @test all(Array(R .== 
(Ay[ :,2:end].+Ay[ :,1:end-1])./2)) - R.=0; @parallel av_xi!(R, Axyy); @test all(Array(R .== (Axyy[2:end ,2:end-1].+Axyy[1:end-1,2:end-1])./2)) - R.=0; @parallel av_yi!(R, Axxy); @test all(Array(R .== (Axxy[2:end-1,2:end ].+Axxy[2:end-1,1:end-1])./2)) + R.=0; @parallel av!(R, Axy); @test all(Array(R .== (Axy[1:end-1,1:end-1].+Axy[2:end,1:end-1].+Axy[1:end-1,2:end].+Axy[2:end,2:end]).*$precision(0.25))) + R.=0; @parallel av_xa!(R, Ax); @test all(Array(R .== (Ax[2:end, :].+Ax[1:end-1, :]).*$precision(0.5))) + R.=0; @parallel av_ya!(R, Ay); @test all(Array(R .== (Ay[ :,2:end].+Ay[ :,1:end-1]).*$precision(0.5))) + R.=0; @parallel av_xi!(R, Axyy); @test all(Array(R .== (Axyy[2:end ,2:end-1].+Axyy[1:end-1,2:end-1]).*$precision(0.5))) + R.=0; @parallel av_yi!(R, Axxy); @test all(Array(R .== (Axxy[2:end-1,2:end ].+Axxy[2:end-1,1:end-1]).*$precision(0.5))) end; @testset "harmonic averages" begin @parallel harm!(R, Axy) = (@all(R) = @harm(Axy); return) diff --git a/test/test_FiniteDifferences3D.jl b/test/test_FiniteDifferences3D.jl index 5ec92b90..807f93ab 100644 --- a/test/test_FiniteDifferences3D.jl +++ b/test/test_FiniteDifferences3D.jl @@ -108,19 +108,19 @@ eval(:( @parallel av_xyi!(R, Axyzz) = (@all(R) = @av_xyi(Axyzz); return) @parallel av_xzi!(R, Axyyz) = (@all(R) = @av_xzi(Axyyz); return) @parallel av_yzi!(R, Axxyz) = (@all(R) = @av_yzi(Axxyz); return) - R.=0; @parallel av!(R, Axyz); @test all(Array(R .== (Axyz[1:end-1,1:end-1,1:end-1].+Axyz[2:end,1:end-1,1:end-1].+Axyz[2:end,2:end,1:end-1].+Axyz[2:end,2:end,2:end].+Axyz[1:end-1,2:end,2:end].+Axyz[1:end-1,1:end-1,2:end].+Axyz[2:end,1:end-1,2:end].+Axyz[1:end-1,2:end,1:end-1])./8)) - R.=0; @parallel av_xa!(R, Ax); @test all(Array(R .== (Ax[2:end, :, :].+Ax[1:end-1, :, :])./2)) - R.=0; @parallel av_ya!(R, Ay); @test all(Array(R .== (Ay[ :,2:end, :].+Ay[ :,1:end-1, :])./2)) - R.=0; @parallel av_za!(R, Az); @test all(Array(R .== (Az[ :, :,2:end].+Az[ :, :,1:end-1])./2)) - R.=0; @parallel av_xi!(R, Axyyzz); @test all(Array(R .== (Axyyzz[2:end ,2:end-1,2:end-1].+Axyyzz[1:end-1,2:end-1,2:end-1])./2)) - R.=0; @parallel av_yi!(R, Axxyzz); @test all(Array(R .== (Axxyzz[2:end-1,2:end ,2:end-1].+Axxyzz[2:end-1,1:end-1,2:end-1])./2)) - R.=0; @parallel av_zi!(R, Axxyyz); @test all(Array(R .== (Axxyyz[2:end-1,2:end-1,2:end ].+Axxyyz[2:end-1,2:end-1,1:end-1])./2)) - R.=0; @parallel av_xya!(R, Axy); @test all(Array(R .== (Axy[1:end-1,1:end-1,:].+Axy[2:end,1:end-1,:].+Axy[1:end-1,2:end,:].+Axy[2:end,2:end,:])./4)) - R.=0; @parallel av_xza!(R, Axz); @test all(Array(R .== (Axz[1:end-1,:,1:end-1].+Axz[2:end,:,1:end-1].+Axz[1:end-1,:,2:end].+Axz[2:end,:,2:end])./4)) - R.=0; @parallel av_yza!(R, Ayz); @test all(Array(R .== (Ayz[:,1:end-1,1:end-1].+Ayz[:,2:end,1:end-1].+Ayz[:,1:end-1,2:end].+Ayz[:,2:end,2:end])./4)) - R.=0; @parallel av_xyi!(R, Axyzz); @test all(Array(R .== (Axyzz[1:end-1,1:end-1,2:end-1].+Axyzz[2:end,1:end-1,2:end-1].+Axyzz[1:end-1,2:end,2:end-1].+Axyzz[2:end,2:end,2:end-1])./4)) - R.=0; @parallel av_xzi!(R, Axyyz); @test all(Array(R .== (Axyyz[1:end-1,2:end-1,1:end-1].+Axyyz[2:end,2:end-1,1:end-1].+Axyyz[1:end-1,2:end-1,2:end].+Axyyz[2:end,2:end-1,2:end])./4)) - R.=0; @parallel av_yzi!(R, Axxyz); @test all(Array(R .== (Axxyz[2:end-1,1:end-1,1:end-1].+Axxyz[2:end-1,2:end,1:end-1].+Axxyz[2:end-1,1:end-1,2:end].+Axxyz[2:end-1,2:end,2:end])./4)) + R.=0; @parallel av!(R, Axyz); @test all(Array(R .== 
(Axyz[1:end-1,1:end-1,1:end-1].+Axyz[2:end,1:end-1,1:end-1].+Axyz[2:end,2:end,1:end-1].+Axyz[2:end,2:end,2:end].+Axyz[1:end-1,2:end,2:end].+Axyz[1:end-1,1:end-1,2:end].+Axyz[2:end,1:end-1,2:end].+Axyz[1:end-1,2:end,1:end-1]).*$precision(0.125))) + R.=0; @parallel av_xa!(R, Ax); @test all(Array(R .== (Ax[2:end, :, :].+Ax[1:end-1, :, :]).*$precision(0.5))) + R.=0; @parallel av_ya!(R, Ay); @test all(Array(R .== (Ay[ :,2:end, :].+Ay[ :,1:end-1, :]).*$precision(0.5))) + R.=0; @parallel av_za!(R, Az); @test all(Array(R .== (Az[ :, :,2:end].+Az[ :, :,1:end-1]).*$precision(0.5))) + R.=0; @parallel av_xi!(R, Axyyzz); @test all(Array(R .== (Axyyzz[2:end ,2:end-1,2:end-1].+Axyyzz[1:end-1,2:end-1,2:end-1]).*$precision(0.5))) + R.=0; @parallel av_yi!(R, Axxyzz); @test all(Array(R .== (Axxyzz[2:end-1,2:end ,2:end-1].+Axxyzz[2:end-1,1:end-1,2:end-1]).*$precision(0.5))) + R.=0; @parallel av_zi!(R, Axxyyz); @test all(Array(R .== (Axxyyz[2:end-1,2:end-1,2:end ].+Axxyyz[2:end-1,2:end-1,1:end-1]).*$precision(0.5))) + R.=0; @parallel av_xya!(R, Axy); @test all(Array(R .== (Axy[1:end-1,1:end-1,:].+Axy[2:end,1:end-1,:].+Axy[1:end-1,2:end,:].+Axy[2:end,2:end,:]).*$precision(0.25))) + R.=0; @parallel av_xza!(R, Axz); @test all(Array(R .== (Axz[1:end-1,:,1:end-1].+Axz[2:end,:,1:end-1].+Axz[1:end-1,:,2:end].+Axz[2:end,:,2:end]).*$precision(0.25))) + R.=0; @parallel av_yza!(R, Ayz); @test all(Array(R .== (Ayz[:,1:end-1,1:end-1].+Ayz[:,2:end,1:end-1].+Ayz[:,1:end-1,2:end].+Ayz[:,2:end,2:end]).*$precision(0.25))) + R.=0; @parallel av_xyi!(R, Axyzz); @test all(Array(R .== (Axyzz[1:end-1,1:end-1,2:end-1].+Axyzz[2:end,1:end-1,2:end-1].+Axyzz[1:end-1,2:end,2:end-1].+Axyzz[2:end,2:end,2:end-1]).*$precision(0.25))) + R.=0; @parallel av_xzi!(R, Axyyz); @test all(Array(R .== (Axyyz[1:end-1,2:end-1,1:end-1].+Axyyz[2:end,2:end-1,1:end-1].+Axyyz[1:end-1,2:end-1,2:end].+Axyyz[2:end,2:end-1,2:end]).*$precision(0.25))) + R.=0; @parallel av_yzi!(R, Axxyz); @test all(Array(R .== (Axxyz[2:end-1,1:end-1,1:end-1].+Axxyz[2:end-1,2:end,1:end-1].+Axxyz[2:end-1,1:end-1,2:end].+Axxyz[2:end-1,2:end,2:end]).*$precision(0.25))) end; @testset "harmonic averages" begin @parallel harm!(R, Axyz) = (@all(R) = @harm(Axyz); return) From 7c9877e567c3d1a7e974d29ee24c2bc3cc28548c Mon Sep 17 00:00:00 2001 From: GiackAloZ Date: Tue, 29 Oct 2024 14:50:48 +0100 Subject: [PATCH 35/46] Fix tests for precision and comparisons --- test/ParallelKernel/test_kernel_language.jl | 4 +- test/test_parallel.jl | 82 ++++++++++----------- 2 files changed, 43 insertions(+), 43 deletions(-) diff --git a/test/ParallelKernel/test_kernel_language.jl b/test/ParallelKernel/test_kernel_language.jl index 8cc48b37..c5a66912 100644 --- a/test/ParallelKernel/test_kernel_language.jl +++ b/test/ParallelKernel/test_kernel_language.jl @@ -41,7 +41,7 @@ eval(:( @test @prettystring(1, @blockDim()) == "CUDA.blockDim()" @test @prettystring(1, @threadIdx()) == "CUDA.threadIdx()" @test @prettystring(1, @sync_threads()) == "CUDA.sync_threads()" - @test @prettystring(1, @sharedMem($precision, (2,3))) == "CUDA.@cuDynamicSharedMem $precision (2, 3)" + @test @prettystring(1, @sharedMem($precision, (2,3))) == "CUDA.@cuDynamicSharedMem $(nameof($precision)) (2, 3)" # @test @prettystring(1, @pk_show()) == "CUDA.@cushow" # @test @prettystring(1, @pk_println()) == "CUDA.@cuprintln" elseif $package == $AMDGPU @@ -50,7 +50,7 @@ eval(:( @test @prettystring(1, @blockDim()) == "AMDGPU.workgroupDim()" @test @prettystring(1, @threadIdx()) == "AMDGPU.workitemIdx()" @test @prettystring(1, @sync_threads()) == 
"AMDGPU.sync_workgroup()" - # @test @prettystring(1, @sharedMem(Float32, (2,3))) == "" #TODO: not yet supported for AMDGPU + # @test @prettystring(1, @sharedMem($precision, (2,3))) == "" #TODO: not yet supported for AMDGPU # @test @prettystring(1, @pk_show()) == "CUDA.@cushow" #TODO: not yet supported for AMDGPU # @test @prettystring(1, @pk_println()) == "AMDGPU.@rocprintln" elseif $package == $PKG_METAL diff --git a/test/test_parallel.jl b/test/test_parallel.jl index 815aea37..07cbf707 100644 --- a/test/test_parallel.jl +++ b/test/test_parallel.jl @@ -220,7 +220,7 @@ eval(:( return end @parallel write_indices!(A); - @test all(Array(A) .== [ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]) + @test all(Array(A) .≈ [ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]) end @testset "@parallel (3D; on-the-fly)" begin nx, ny, nz = 32, 8, 8 @@ -244,7 +244,7 @@ eval(:( + ((T[2:end-1,3:end ,2:end-1] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,1:end-2,2:end-1])).*_dy^2 + ((T[2:end-1,2:end-1,3:end ] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,2:end-1,1:end-2])).*_dz^2) ); - @test all(Array(T2) .== Array(T2_ref)) + @test all(Array(T2) .≈ Array(T2_ref)) end @static if $package in [$PKG_CUDA, $PKG_AMDGPU] # TODO add support for Metal @testset "@parallel memopt (nx, ny, nz = x .* threads)" begin # NOTE: the following does not work for some reason: (nx, ny, nz = ($nx, $ny, $nz))" for (nx, ny, nz) in ((32, 8, 9), (32, 8, 8), (31, 7, 9), (33, 9, 9), (33, 7, 8)) @@ -287,7 +287,7 @@ eval(:( end @parallel memopt=true d2_memopt!(A2, A); A2_ref[:,:,2:end-1] .= A[:,:,3:end] .- 2*A[:,:,2:end-1] .+ A[:,:,1:end-2]; - @test all(Array(A2) .== Array(A2_ref)) + @test all(Array(A2) .≈ Array(A2_ref)) end @testset "@parallel_indices (3D, memopt, stencilranges=(0:0, -1:1, 0:0); y-stencil)" begin A = @zeros(nx, ny, nz); @@ -302,7 +302,7 @@ eval(:( end @parallel memopt=true d2_memopt!(A2, A); A2_ref[:,2:end-1,:] .= A[:,3:end,:] .- 2*A[:,2:end-1,:] .+ A[:,1:end-2,:]; - @test all(Array(A2) .== Array(A2_ref)) + @test all(Array(A2) .≈ Array(A2_ref)) end @testset "@parallel (3D, memopt, stencilranges=(1:1, 1:1, 0:2); z-stencil)" begin A = @zeros(nx, ny, nz); @@ -315,7 +315,7 @@ eval(:( end @parallel memopt=true d2_memopt!(A2, A); A2_ref[2:end-1,2:end-1,2:end-1] .= A[2:end-1,2:end-1,3:end] .- 2*A[2:end-1,2:end-1,2:end-1] .+ A[2:end-1,2:end-1,1:end-2]; - @test all(Array(A2) .== Array(A2_ref)) + @test all(Array(A2) .≈ Array(A2_ref)) end @testset "@parallel (3D, memopt, stencilranges=(1:1, 0:2, 1:1); y-stencil)" begin A = @zeros(nx, ny, nz); @@ -328,7 +328,7 @@ eval(:( end @parallel memopt=true d2_memopt!(A2, A); A2_ref[2:end-1,2:end-1,2:end-1] .= A[2:end-1,3:end,2:end-1] .- 2*A[2:end-1,2:end-1,2:end-1] .+ A[2:end-1,1:end-2,2:end-1]; - @test all(Array(A2) .== Array(A2_ref)) + @test all(Array(A2) .≈ Array(A2_ref)) end @testset "@parallel_indices (3D, memopt, stencilranges=-1:1)" begin lam=dt=_dx=_dy=_dz = 1 @@ -353,7 +353,7 @@ eval(:( - ((.-lam.*(T[2:end-1,3:end ,2:end-1] .- T[2:end-1,2:end-1,2:end-1]).*_dy) .- (.-lam.*(T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,1:end-2,2:end-1]).*_dy)).*_dy - ((.-lam.*(T[2:end-1,2:end-1,3:end ] .- T[2:end-1,2:end-1,2:end-1]).*_dz) .- (.-lam.*(T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,2:end-1,1:end-2]).*_dz)).*_dz) ); - @test all(Array(T2) .== Array(T2_ref)) + @test all(Array(T2) .≈ Array(T2_ref)) end @testset "@parallel (3D, memopt, 
stencilranges=0:2)" begin lam=dt=_dx=_dy=_dz = 1 @@ -372,7 +372,7 @@ eval(:( + ((T[2:end-1,3:end ,2:end-1] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,1:end-2,2:end-1])).*_dy^2 + ((T[2:end-1,2:end-1,3:end ] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,2:end-1,1:end-2])).*_dz^2) ); - @test all(Array(T2) .== Array(T2_ref)) + @test all(Array(T2) .≈ Array(T2_ref)) end @testset "@parallel_indices (3D, memopt, stencilranges=(-4:-1, 2:2, -2:3); x-z-stencil, y-shift)" begin A = @zeros(nx, ny, nz); @@ -387,7 +387,7 @@ eval(:( end @parallel memopt=true higher_order_memopt!(A2, A); A2_ref[5:end-1,3:end,3:end-3] .= A[5:end-1,3:end,6:end] .- 2*A[3:end-3,3:end,3:end-3] .+ A[2:end-4,3:end,1:end-5]; - @test all(Array(A2) .== Array(A2_ref)) + @test all(Array(A2) .≈ Array(A2_ref)) end @testset "@parallel (3D, memopt, stencilranges=0:2; on-the-fly)" begin lam=dt=_dx=_dy=_dz = 1 @@ -410,7 +410,7 @@ eval(:( + ((T[2:end-1,3:end ,2:end-1] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,1:end-2,2:end-1])).*_dy^2 + ((T[2:end-1,2:end-1,3:end ] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,2:end-1,1:end-2])).*_dz^2) ); - @test all(Array(T2) .== Array(T2_ref)) + @test all(Array(T2) .≈ Array(T2_ref)) end @testset "@parallel (3D, memopt, stencilranges=0:0; 2 arrays)" begin A = @zeros(nx, ny, nz); @@ -423,7 +423,7 @@ eval(:( return end @parallel memopt=true copy_memopt!(A2, A, B); - @test all(Array(A2) .== Array(A) .+ Array(B)) + @test all(Array(A2) .≈ Array(A) .+ Array(B)) end @testset "@parallel_indices (3D, memopt; 2 arrays, z-stencil)" begin A = @zeros(nx, ny, nz); @@ -440,7 +440,7 @@ eval(:( end @parallel memopt=true d2_memopt!(A2, A, B); A2_ref[:,:,2:end-1] .= A[:,:,3:end] .- 2*A[:,:,2:end-1] .+ A[:,:,1:end-2] .+ B[:,:,2:end] .- B[:,:,1:end-1]; - @test all(Array(A2) .== Array(A2_ref)) + @test all(Array(A2) .≈ Array(A2_ref)) end @testset "@parallel_indices (3D, memopt; 2 arrays, y-stencil)" begin A = @zeros(nx, ny, nz); @@ -457,7 +457,7 @@ eval(:( end @parallel memopt=true d2_memopt!(A2, A, B); A2_ref[:,2:end-1,:] .= (((A[:,3:end,:] .- 2*A[:,2:end-1,:]) .+ A[:,1:end-2,:] .+ B[:,3:end,:]) .- 2*B[:,2:end-1,:]) .+ B[:,1:end-2,:]; - @test all(Array(A2) .== Array(A2_ref)) + @test all(Array(A2) .≈ Array(A2_ref)) end @testset "@parallel_indices (3D, memopt; 2 arrays, x-stencil)" begin A = @zeros(nx, ny, nz); @@ -474,7 +474,7 @@ eval(:( end @parallel memopt=true d2_memopt!(A2, A, B); A2_ref[2:end-1,:,:] .= A[3:end,:,:] .- 2*A[2:end-1,:,:] .+ A[1:end-2,:,:] .+ B[2:end,:,:] .- B[1:end-1,:,:]; - @test all(Array(A2) .== Array(A2_ref)) + @test all(Array(A2) .≈ Array(A2_ref)) end @testset "@parallel (3D, memopt; 2 arrays, x-y-z- + z-stencil)" begin lam=dt=_dx=_dy=_dz = 1 @@ -494,7 +494,7 @@ eval(:( + ((T[2:end-1,3:end ,2:end-1] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,1:end-2,2:end-1])).*_dy^2 + ((T[2:end-1,2:end-1,3:end ] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,2:end-1,1:end-2])).*_dz^2) ); - @test all(Array(T2) .== Array(T2_ref)) + @test all(Array(T2) .≈ Array(T2_ref)) end @testset "@parallel (3D, memopt; 2 arrays, x-y-z- + x-stencil)" begin lam=dt=_dx=_dy=_dz = 1 @@ -514,7 +514,7 @@ eval(:( + ((T[2:end-1,3:end ,2:end-1] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,1:end-2,2:end-1])).*_dy^2 + ((T[2:end-1,2:end-1,3:end ] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- 
T[2:end-1,2:end-1,1:end-2])).*_dz^2) ); - @test all(Array(T2) .== Array(T2_ref)) + @test all(Array(T2) .≈ Array(T2_ref)) end @testset "@parallel (3D, memopt; 3 arrays, x-y-z- + y- + x-stencil)" begin lam=dt=_dx=_dy=_dz = 1 @@ -536,7 +536,7 @@ eval(:( + ((T[2:end-1,3:end ,2:end-1] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,1:end-2,2:end-1])).*_dy^2 + ((T[2:end-1,2:end-1,3:end ] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,2:end-1,1:end-2])).*_dz^2) ) + ((B[2:end-1,3:end ,2:end-1] .- B[2:end-1,2:end-1,2:end-1]) .- (B[2:end-1,2:end-1,2:end-1] .- B[2:end-1,1:end-2,2:end-1])); - @test all(Array(T2) .== Array(T2_ref)) + @test all(Array(T2) .≈ Array(T2_ref)) end @testset "@parallel_indices (3D, memopt, stencilranges=(-4:-1, 2:2, -2:3); 3 arrays, x-z-stencil, y-shift)" begin A = @zeros(nx, ny, nz); @@ -567,9 +567,9 @@ eval(:( A2_ref[5:end-1,3:end,3:end-3] .= A[5:end-1,3:end,6:end] .- 2*A[3:end-3,3:end,3:end-3] .+ A[2:end-4,3:end,1:end-5]; B2_ref[5:end-1,3:end,3:end-3] .= B[5:end-1,3:end,6:end] .- 2*B[3:end-3,3:end,3:end-3] .+ B[2:end-4,3:end,1:end-5]; C2_ref[5:end-1,3:end,3:end-3] .= C[5:end-1,3:end,6:end] .- 2*C[3:end-3,3:end,3:end-3] .+ C[2:end-4,3:end,1:end-5]; - @test all(Array(A2) .== Array(A2_ref)) - @test all(Array(B2) .== Array(B2_ref)) - @test all(Array(C2) .== Array(C2_ref)) + @test all(Array(A2) .≈ Array(A2_ref)) + @test all(Array(B2) .≈ Array(B2_ref)) + @test all(Array(C2) .≈ Array(C2_ref)) end @testset "@parallel_indices (3D, memopt, stencilranges=(A=(-4:-1, 2:2, -2:3), B=(-4:-1, 2:2, 1:2), C=(-4:-1, 2:2, -1:0)); 3 arrays, x-z-stencil, y-shift)" begin A = @zeros(nx, ny, nz); @@ -600,9 +600,9 @@ eval(:( A2_ref[5:end-1,3:end,3:end-3] .= A[5:end-1,3:end,6:end] .- 2*A[3:end-3,3:end,3:end-3] .+ A[2:end-4,3:end,1:end-5]; B2_ref[5:end-1,3:end,2:end-1] .= B[5:end-1,3:end,3:end] .- 2*B[3:end-3,3:end,2:end-1] .+ B[2:end-4,3:end,2:end-1]; C2_ref[5:end-1,3:end,1:end-1] .= C[5:end-1,3:end,2:end] .- 2*C[3:end-3,3:end,1:end-1] .+ C[2:end-4,3:end,1:end-1]; - @test all(Array(A2) .== Array(A2_ref)) - @test all(Array(B2) .== Array(B2_ref)) - @test all(Array(C2) .== Array(C2_ref)) + @test all(Array(A2) .≈ Array(A2_ref)) + @test all(Array(B2) .≈ Array(B2_ref)) + @test all(Array(C2) .≈ Array(C2_ref)) end @testset "@parallel_indices (3D, memopt, stencilranges=(A=(-4:-1, 2:2, -2:3), B=(-4:1, 2:2, 1:2), C=(-1:-1, 2:2, -1:0)); 3 arrays, x-z-stencil, y-shift)" begin A = @zeros(nx, ny, nz); @@ -633,9 +633,9 @@ eval(:( A2_ref[5:end-1,3:end,3:end-3] .= A[5:end-1,3:end,6:end] .- 2*A[3:end-3,3:end,3:end-3] .+ A[2:end-4,3:end,1:end-5]; B2_ref[7:end-1,3:end,2:end-1] .= B[7:end-1,3:end,3:end] .- 2*B[3:end-5,3:end,2:end-1] .+ B[2:end-6,3:end,2:end-1]; C2_ref[2:end-1,3:end,1:end-1] .= C[2:end-1,3:end,2:end] .- 2*C[2:end-1,3:end,1:end-1] .+ C[2:end-1,3:end,1:end-1]; - @test all(Array(A2) .== Array(A2_ref)) - @test all(Array(B2) .== Array(B2_ref)) - @test all(Array(C2) .== Array(C2_ref)) + @test all(Array(A2) .≈ Array(A2_ref)) + @test all(Array(B2) .≈ Array(B2_ref)) + @test all(Array(C2) .≈ Array(C2_ref)) end @testset "@parallel_indices (3D, memopt, optvars=(A, C), loopdim=3, loopsize=3, optranges=(A=(-4:-1, 2:2, -2:3), B=(-4:1, 2:2, 1:2), C=(-1:-1, 2:2, -1:0)); stencilranges=(A=(-4:-1, 2:2, -2:3), B=(-4:1, 2:2, 1:2), C=(-1:-1, 2:2, -1:0)), 3 arrays, x-z-stencil, y-shift)" begin A = @zeros(nx, ny, nz); @@ -690,9 +690,9 @@ eval(:( A2_ref[5:end-1,3:end,3:end-3] .= A[5:end-1,3:end,6:end] .- 2*A[3:end-3,3:end,3:end-3] .+ A[2:end-4,3:end,1:end-5]; 
B2_ref[7:end-1,3:end,2:end-1] .= B[7:end-1,3:end,3:end] .- 2*B[3:end-5,3:end,2:end-1] .+ B[2:end-6,3:end,2:end-1]; C2_ref[2:end-1,3:end,1:end-1] .= C[2:end-1,3:end,2:end] .- 2*C[2:end-1,3:end,1:end-1] .+ C[2:end-1,3:end,1:end-1]; - @test all(Array(A2) .== Array(A2_ref)) - @test all(Array(B2) .== Array(B2_ref)) - @test all(Array(C2) .== Array(C2_ref)) + @test all(Array(A2) .≈ Array(A2_ref)) + @test all(Array(B2) .≈ Array(B2_ref)) + @test all(Array(C2) .≈ Array(C2_ref)) end @testset "@parallel_indices (3D, memopt, optvars=(A, C), loopdim=3, loopsize=3, optranges=(A=(-4:-1, 2:2, -2:3), B=(-4:1, 2:2, 1:2), C=(-1:-1, 2:2, -1:0)); stencilranges=(A=(-4:-1, 2:2, -2:3), B=(-4:1, 2:2, 1:2), C=(-1:-1, 2:2, -1:0)), 3 arrays, x-z-stencil, y-shift)" begin A = @zeros(nx, ny, nz); @@ -747,9 +747,9 @@ eval(:( A2_ref[5:end-1,3:end,3:end-3] .= A[5:end-1,3:end,6:end] .- 2*A[3:end-3,3:end,3:end-3] .+ A[2:end-4,3:end,1:end-5]; B2_ref[7:end-1,3:end,2:end-1] .= B[7:end-1,3:end,3:end] .- 2*B[3:end-5,3:end,2:end-1] .+ B[2:end-6,3:end,2:end-1]; C2_ref[2:end-1,3:end,1:end-1] .= C[2:end-1,3:end,2:end] .- 2*C[2:end-1,3:end,1:end-1] .+ C[2:end-1,3:end,1:end-1]; - @test all(Array(A2) .== Array(A2_ref)) - @test all(Array(B2) .== Array(B2_ref)) - @test all(Array(C2) .== Array(C2_ref)) + @test all(Array(A2) .≈ Array(A2_ref)) + @test all(Array(B2) .≈ Array(B2_ref)) + @test all(Array(C2) .≈ Array(C2_ref)) end @testset "@parallel_indices (3D, memopt, optvars=(A, B), loopdim=3, loopsize=3, optranges=(A=(-1:-1, 2:2, -2:3), B=(-4:1, 2:2, 1:1)); stencilranges=(A=(-4:-1, 2:2, -2:3), B=(-4:1, 2:2, 1:2), C=(-1:-1, 2:2, -1:0)), 3 arrays, x-z-stencil, y-shift)" begin A = @zeros(nx, ny, nz); @@ -795,9 +795,9 @@ eval(:( A2_ref[5:end-1,3:end,3:end-3] .= A[5:end-1,3:end,6:end] .- 2*A[3:end-3,3:end,3:end-3] .+ A[2:end-4,3:end,1:end-5]; B2_ref[7:end-1,3:end,2:end-1] .= B[7:end-1,3:end,3:end] .- 2*B[3:end-5,3:end,2:end-1] .+ B[2:end-6,3:end,2:end-1]; C2_ref[2:end-1,3:end,1:end-1] .= C[2:end-1,3:end,2:end] .- 2*C[2:end-1,3:end,1:end-1] .+ C[2:end-1,3:end,1:end-1]; - @test all(Array(A2) .== Array(A2_ref)) - @test all(Array(B2) .== Array(B2_ref)) - @test all(Array(C2) .== Array(C2_ref)) + @test all(Array(A2) .≈ Array(A2_ref)) + @test all(Array(B2) .≈ Array(B2_ref)) + @test all(Array(C2) .≈ Array(C2_ref)) end end @testset "@parallel memopt (nx, ny, nz != x .* threads)" begin @@ -844,7 +844,7 @@ eval(:( + ((T[2:end-1,3:end ,2:end-1] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,1:end-2,2:end-1])).*_dy^2 + ((T[2:end-1,2:end-1,3:end ] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,2:end-1,1:end-2])).*_dz^2) ); - @test all(Array(T2) .== Array(T2_ref)) + @test all(Array(T2) .≈ Array(T2_ref)) end @testset "@parallel (3D, memopt; 3 arrays, x-y-z- + y- + x-stencil)" begin lam=dt=_dx=_dy=_dz = 1 @@ -866,7 +866,7 @@ eval(:( + ((T[2:end-1,3:end ,2:end-1] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,1:end-2,2:end-1])).*_dy^2 + ((T[2:end-1,2:end-1,3:end ] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,2:end-1,1:end-2])).*_dz^2) ) + ((B[2:end-1,3:end ,2:end-1] .- B[2:end-1,2:end-1,2:end-1]) .- (B[2:end-1,2:end-1,2:end-1] .- B[2:end-1,1:end-2,2:end-1])); - @test all(Array(T2) .== Array(T2_ref)) + @test all(Array(T2) .≈ Array(T2_ref)) end end end @@ -905,7 +905,7 @@ eval(:( - ((.-lam.*(T[3:end ,2:end-1,1] .- T[2:end-1,2:end-1,1]).*_dx) .- (.-lam.*(T[2:end-1,2:end-1,1] .- T[1:end-2,2:end-1,1]).*_dx)).*_dx - ((.-lam.*(T[2:end-1,3:end ,1] .- 
T[2:end-1,2:end-1,1]).*_dy) .- (.-lam.*(T[2:end-1,2:end-1,1] .- T[2:end-1,1:end-2,1]).*_dy)).*_dy) ); - @test all(Array(T2) .== Array(T2_ref)) + @test all(Array(T2) .≈ Array(T2_ref)) end; end; @reset_parallel_stencil() @@ -934,7 +934,7 @@ eval(:( return end @parallel write_indices!(A, one); - @test all(Array(A) .== [(ix-1) for ix=1:size(A,1)]) + @test all(Array(A) .≈ [(ix-1) for ix=1:size(A,1)]) @reset_parallel_stencil() end; @testset "@parallel_indices (I...) (2D)" begin @@ -948,7 +948,7 @@ eval(:( return end @parallel write_indices!(A, one); - @test all(Array(A) .== [(ix-1) + (iy-1)*size(A,1) for ix=1:size(A,1), iy=1:size(A,2)]) + @test all(Array(A) .≈ [(ix-1) + (iy-1)*size(A,1) for ix=1:size(A,1), iy=1:size(A,2)]) @reset_parallel_stencil() end; @testset "@parallel_indices (I...) (3D)" begin @@ -962,7 +962,7 @@ eval(:( return end @parallel write_indices!(A, one); - @test all(Array(A) .== [(ix-1) + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]) + @test all(Array(A) .≈ [(ix-1) + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]) @reset_parallel_stencil() end; end; From 19b876353cbd83b498574a4825b7acdd5594fbdd Mon Sep 17 00:00:00 2001 From: GiackAloZ Date: Tue, 29 Oct 2024 14:55:11 +0100 Subject: [PATCH 36/46] Check for `Sys.isapple()` before importing Metal to avoid errors in tests --- test/ParallelKernel/test_hide_communication.jl | 8 ++++++-- test/ParallelKernel/test_init_parallel_kernel.jl | 8 ++++++-- test/ParallelKernel/test_kernel_language.jl | 8 ++++++-- test/ParallelKernel/test_parallel.jl | 8 ++++++-- test/ParallelKernel/test_reset_parallel_kernel.jl | 8 ++++++-- test/runtests.jl | 2 +- test/test_FiniteDifferences1D.jl | 8 ++++++-- test/test_FiniteDifferences2D.jl | 8 ++++++-- test/test_FiniteDifferences3D.jl | 8 ++++++-- test/test_extensions.jl | 8 ++++++-- test/test_incremental_compilation.jl | 8 ++++++-- test/test_init_parallel_stencil.jl | 8 ++++++-- test/test_parallel.jl | 8 ++++++-- test/test_reset_parallel_stencil.jl | 8 ++++++-- 14 files changed, 79 insertions(+), 27 deletions(-) diff --git a/test/ParallelKernel/test_hide_communication.jl b/test/ParallelKernel/test_hide_communication.jl index d018bc4c..e8ab02b3 100644 --- a/test/ParallelKernel/test_hide_communication.jl +++ b/test/ParallelKernel/test_hide_communication.jl @@ -15,8 +15,12 @@ end if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end @static if PKG_METAL in TEST_PACKAGES - import Metal - if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end + @static if Sys.isapple() + import Metal + if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end + else + TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) + end end @static if PKG_POLYESTER in TEST_PACKAGES import Polyester diff --git a/test/ParallelKernel/test_init_parallel_kernel.jl b/test/ParallelKernel/test_init_parallel_kernel.jl index c200362c..e26308bb 100644 --- a/test/ParallelKernel/test_init_parallel_kernel.jl +++ b/test/ParallelKernel/test_init_parallel_kernel.jl @@ -15,8 +15,12 @@ end if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end @static if PKG_METAL in TEST_PACKAGES - import Metal - if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end + @static if Sys.isapple() + import Metal + if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end + else + TEST_PACKAGES = 
filter!(x->x≠PKG_METAL, TEST_PACKAGES) + end end @static if PKG_POLYESTER in TEST_PACKAGES import Polyester diff --git a/test/ParallelKernel/test_kernel_language.jl b/test/ParallelKernel/test_kernel_language.jl index c5a66912..0e691b3b 100644 --- a/test/ParallelKernel/test_kernel_language.jl +++ b/test/ParallelKernel/test_kernel_language.jl @@ -15,8 +15,12 @@ end if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end @static if PKG_METAL in TEST_PACKAGES - import Metal - if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end + @static if Sys.isapple() + import Metal + if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end + else + TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) + end end @static if PKG_POLYESTER in TEST_PACKAGES import Polyester diff --git a/test/ParallelKernel/test_parallel.jl b/test/ParallelKernel/test_parallel.jl index fcba1dbf..1e0ea3f9 100644 --- a/test/ParallelKernel/test_parallel.jl +++ b/test/ParallelKernel/test_parallel.jl @@ -17,8 +17,12 @@ end if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end @static if PKG_METAL in TEST_PACKAGES - import Metal - if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end + @static if Sys.isapple() + import Metal + if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end + else + TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) + end end @static if PKG_POLYESTER in TEST_PACKAGES import Polyester diff --git a/test/ParallelKernel/test_reset_parallel_kernel.jl b/test/ParallelKernel/test_reset_parallel_kernel.jl index 1f404c04..fe2cc01a 100644 --- a/test/ParallelKernel/test_reset_parallel_kernel.jl +++ b/test/ParallelKernel/test_reset_parallel_kernel.jl @@ -13,8 +13,12 @@ end if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end @static if PKG_METAL in TEST_PACKAGES - import Metal - if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end + @static if Sys.isapple() + import Metal + if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end + else + TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) + end end @static if PKG_POLYESTER in TEST_PACKAGES import Polyester diff --git a/test/runtests.jl b/test/runtests.jl index 987a96bc..223f5b11 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -5,7 +5,7 @@ import ParallelStencil # Precompile it. 
import ParallelStencil: SUPPORTED_PACKAGES, PKG_CUDA, PKG_AMDGPU, PKG_METAL @static if (PKG_CUDA in SUPPORTED_PACKAGES) import CUDA end @static if (PKG_AMDGPU in SUPPORTED_PACKAGES) import AMDGPU end -@static if (PKG_METAL in SUPPORTED_PACKAGES) import Metal end +@static if (PKG_METAL in SUPPORTED_PACKAGES && Sys.isapple()) import Metal end excludedfiles = [ "test_excluded.jl", "test_incremental_compilation.jl"]; # TODO: test_incremental_compilation has to be deactivated until Polyester support released diff --git a/test/test_FiniteDifferences1D.jl b/test/test_FiniteDifferences1D.jl index 1cb35cee..5d95b6b5 100644 --- a/test/test_FiniteDifferences1D.jl +++ b/test/test_FiniteDifferences1D.jl @@ -13,8 +13,12 @@ end if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end @static if PKG_METAL in TEST_PACKAGES - import Metal - if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end + @static if Sys.isapple() + import Metal + if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end + else + TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) + end end @static if PKG_POLYESTER in TEST_PACKAGES import Polyester diff --git a/test/test_FiniteDifferences2D.jl b/test/test_FiniteDifferences2D.jl index acc7cac4..7c1f13a1 100644 --- a/test/test_FiniteDifferences2D.jl +++ b/test/test_FiniteDifferences2D.jl @@ -13,8 +13,12 @@ end if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end @static if PKG_METAL in TEST_PACKAGES - import Metal - if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end + @static if Sys.isapple() + import Metal + if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end + else + TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) + end end @static if PKG_POLYESTER in TEST_PACKAGES import Polyester diff --git a/test/test_FiniteDifferences3D.jl b/test/test_FiniteDifferences3D.jl index 807f93ab..a8fb81b0 100644 --- a/test/test_FiniteDifferences3D.jl +++ b/test/test_FiniteDifferences3D.jl @@ -13,8 +13,12 @@ end if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end @static if PKG_METAL in TEST_PACKAGES - import Metal - if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end + @static if Sys.isapple() + import Metal + if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end + else + TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) + end end @static if PKG_POLYESTER in TEST_PACKAGES import Polyester diff --git a/test/test_extensions.jl b/test/test_extensions.jl index b9a47ec9..c79b7ded 100644 --- a/test/test_extensions.jl +++ b/test/test_extensions.jl @@ -11,8 +11,12 @@ end if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end @static if PKG_METAL in TEST_PACKAGES - import Metal - if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end + @static if Sys.isapple() + import Metal + if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end + else + TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) + end end @static if PKG_POLYESTER in TEST_PACKAGES import Polyester diff --git a/test/test_incremental_compilation.jl b/test/test_incremental_compilation.jl index 0a82ddf0..e7da4fab 100644 --- a/test/test_incremental_compilation.jl +++ b/test/test_incremental_compilation.jl @@ -10,8 +10,12 @@ end if !AMDGPU.functional() TEST_PACKAGES = 
filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end @static if PKG_METAL in TEST_PACKAGES - import Metal - if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end + @static if Sys.isapple() + import Metal + if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end + else + TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) + end end @static if PKG_POLYESTER in TEST_PACKAGES import Polyester diff --git a/test/test_init_parallel_stencil.jl b/test/test_init_parallel_stencil.jl index c4ac67ee..6483cccd 100644 --- a/test/test_init_parallel_stencil.jl +++ b/test/test_init_parallel_stencil.jl @@ -14,8 +14,12 @@ end if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end @static if PKG_METAL in TEST_PACKAGES - import Metal - if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end + @static if Sys.isapple() + import Metal + if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end + else + TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) + end end @static if PKG_POLYESTER in TEST_PACKAGES import Polyester diff --git a/test/test_parallel.jl b/test/test_parallel.jl index 07cbf707..a3c66946 100644 --- a/test/test_parallel.jl +++ b/test/test_parallel.jl @@ -16,8 +16,12 @@ end if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end @static if PKG_METAL in TEST_PACKAGES - import Metal - if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end + @static if Sys.isapple() + import Metal + if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end + else + TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) + end end @static if PKG_POLYESTER in TEST_PACKAGES import Polyester diff --git a/test/test_reset_parallel_stencil.jl b/test/test_reset_parallel_stencil.jl index d160537e..08b66da5 100644 --- a/test/test_reset_parallel_stencil.jl +++ b/test/test_reset_parallel_stencil.jl @@ -12,8 +12,12 @@ end if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end @static if PKG_METAL in TEST_PACKAGES - import Metal - if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end + @static if Sys.isapple() + import Metal + if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end + else + TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) + end end @static if PKG_POLYESTER in TEST_PACKAGES import Polyester From 0fef75da722bf56c3facbd79e635ed52f94120e5 Mon Sep 17 00:00:00 2001 From: GiackAloZ Date: Tue, 29 Oct 2024 14:57:41 +0100 Subject: [PATCH 37/46] Fix runtests --- test/runtests.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 223f5b11..cb847afd 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -26,7 +26,7 @@ function runtests() @warn "Test Skip: All AMDGPU tests will be skipped because AMDGPU is not functional (if this is unexpected type `import AMDGPU; AMDGPU.functional()` to debug your AMDGPU installation)." end - if (PKG_METAL in SUPPORTED_PACKAGES && !Metal.functional()) + if (PKG_METAL in SUPPORTED_PACKAGES && (!Sys.isapple() || !Metal.functional())) @warn "Test Skip: All Metal tests will be skipped because Metal is not functional (if this is unexpected type `import Metal; Metal.functional()` to debug your Metal installation)." 
end From 484ee179bc17ecd724d821f794be0570f31ab255 Mon Sep 17 00:00:00 2001 From: GiackAloZ Date: Tue, 29 Oct 2024 15:00:54 +0100 Subject: [PATCH 38/46] Fix test_allocators --- test/ParallelKernel/test_allocators.jl | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/test/ParallelKernel/test_allocators.jl b/test/ParallelKernel/test_allocators.jl index b58d3212..cc982fee 100644 --- a/test/ParallelKernel/test_allocators.jl +++ b/test/ParallelKernel/test_allocators.jl @@ -22,7 +22,15 @@ end @static if PKG_METAL in TEST_PACKAGES import Metal if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end - @define_MtlCellArray +end +@static if PKG_METAL in TEST_PACKAGES + @static if Sys.isapple() + import Metal + if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end + @define_MtlCellArray + else + TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) + end end @static if PKG_POLYESTER in TEST_PACKAGES import Polyester From 73d15fae1dbd78ce7b4493817f97547cc527ecb3 Mon Sep 17 00:00:00 2001 From: GiackAloZ Date: Tue, 29 Oct 2024 15:04:01 +0100 Subject: [PATCH 39/46] Rollback some of the checks --- test/ParallelKernel/test_allocators.jl | 10 +--------- test/ParallelKernel/test_kernel_language.jl | 8 ++------ test/ParallelKernel/test_parallel.jl | 8 ++------ test/test_parallel.jl | 8 ++------ 4 files changed, 7 insertions(+), 27 deletions(-) diff --git a/test/ParallelKernel/test_allocators.jl b/test/ParallelKernel/test_allocators.jl index cc982fee..b58d3212 100644 --- a/test/ParallelKernel/test_allocators.jl +++ b/test/ParallelKernel/test_allocators.jl @@ -22,15 +22,7 @@ end @static if PKG_METAL in TEST_PACKAGES import Metal if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end -end -@static if PKG_METAL in TEST_PACKAGES - @static if Sys.isapple() - import Metal - if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end - @define_MtlCellArray - else - TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) - end + @define_MtlCellArray end @static if PKG_POLYESTER in TEST_PACKAGES import Polyester diff --git a/test/ParallelKernel/test_kernel_language.jl b/test/ParallelKernel/test_kernel_language.jl index 0e691b3b..c5a66912 100644 --- a/test/ParallelKernel/test_kernel_language.jl +++ b/test/ParallelKernel/test_kernel_language.jl @@ -15,12 +15,8 @@ end if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end @static if PKG_METAL in TEST_PACKAGES - @static if Sys.isapple() - import Metal - if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end - else - TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) - end + import Metal + if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end end @static if PKG_POLYESTER in TEST_PACKAGES import Polyester diff --git a/test/ParallelKernel/test_parallel.jl b/test/ParallelKernel/test_parallel.jl index 1e0ea3f9..fcba1dbf 100644 --- a/test/ParallelKernel/test_parallel.jl +++ b/test/ParallelKernel/test_parallel.jl @@ -17,12 +17,8 @@ end if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end @static if PKG_METAL in TEST_PACKAGES - @static if Sys.isapple() - import Metal - if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end - else - TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) - end + import Metal + if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end end @static if 
PKG_POLYESTER in TEST_PACKAGES import Polyester diff --git a/test/test_parallel.jl b/test/test_parallel.jl index a3c66946..07cbf707 100644 --- a/test/test_parallel.jl +++ b/test/test_parallel.jl @@ -16,12 +16,8 @@ end if !AMDGPU.functional() TEST_PACKAGES = filter!(x->x≠PKG_AMDGPU, TEST_PACKAGES) end end @static if PKG_METAL in TEST_PACKAGES - @static if Sys.isapple() - import Metal - if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end - else - TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) - end + import Metal + if !Metal.functional() TEST_PACKAGES = filter!(x->x≠PKG_METAL, TEST_PACKAGES) end end @static if PKG_POLYESTER in TEST_PACKAGES import Polyester From 56a5d5537de9973c0b784e4175738bafd6d70078 Mon Sep 17 00:00:00 2001 From: Giacomo Aloisi <14826807+GiackAloZ@users.noreply.github.com> Date: Wed, 30 Oct 2024 14:00:26 +0100 Subject: [PATCH 40/46] Apply suggestions from code review Co-authored-by: Samuel Omlin Co-authored-by: Albert de Montserrat <58044444+albert-de-montserrat@users.noreply.github.com> --- Project.toml | 2 +- src/ParallelKernel/parallel.jl | 6 +++--- src/ParallelStencil.jl | 2 +- src/init_parallel_stencil.jl | 2 +- src/parallel.jl | 6 +++--- test/ParallelKernel/test_hide_communication.jl | 2 +- test/ParallelKernel/test_kernel_language.jl | 2 +- test/ParallelKernel/test_parallel.jl | 2 +- test/test_FiniteDifferences1D.jl | 4 ++-- test/test_FiniteDifferences2D.jl | 4 ++-- test/test_FiniteDifferences3D.jl | 4 ++-- 11 files changed, 18 insertions(+), 18 deletions(-) diff --git a/Project.toml b/Project.toml index fbe5a586..36d6396f 100644 --- a/Project.toml +++ b/Project.toml @@ -28,7 +28,7 @@ CUDA = "3.12, 4, 5" CellArrays = "0.3" Enzyme = "0.11, 0.12, 0.13" MacroTools = "0.5" -Metal = "^1.2" +Metal = "1.2" Polyester = "0.7" StaticArrays = "1" julia = "1.10" # Minimum version supporting Data module creation diff --git a/src/ParallelKernel/parallel.jl b/src/ParallelKernel/parallel.jl index 16b52e9f..24c4e3f8 100644 --- a/src/ParallelKernel/parallel.jl +++ b/src/ParallelKernel/parallel.jl @@ -15,8 +15,8 @@ Declare the `kernelcall` parallel. The kernel will automatically be called as re - `kernelcall`: a call to a kernel that is declared parallel. !!! note "Advanced optional arguments" - `ranges::Tuple{UnitRange{},UnitRange{},UnitRange{}} | Tuple{UnitRange{},UnitRange{}} | Tuple{UnitRange{}} | UnitRange{}`: the ranges of indices in each dimension for which computations must be performed. - - `nblocks::Tuple{Integer,Integer,Integer}`: the number of blocks to be used if the package CUDA or AMDGPU or Metal was selected with [`@init_parallel_kernel`](@ref). - - `nthreads::Tuple{Integer,Integer,Integer}`: the number of threads to be used if the package CUDA or AMDGPU or Metal was selected with [`@init_parallel_kernel`](@ref). + - `nblocks::Tuple{Integer,Integer,Integer}`: the number of blocks to be used if the package CUDA, AMDGPU or Metal was selected with [`@init_parallel_kernel`](@ref). + - `nthreads::Tuple{Integer,Integer,Integer}`: the number of threads to be used if the package CUDA, AMDGPU or Metal was selected with [`@init_parallel_kernel`](@ref). # Keyword arguments !!! note "Advanced" @@ -24,7 +24,7 @@ Declare the `kernelcall` parallel. The kernel will automatically be called as re - `ad_mode=Enzyme.Reverse`: the automatic differentiation mode (see the documentation of Enzyme.jl for more information). 
- `ad_annotations=()`: Enzyme variable annotations for automatic differentiation in the format `(<keyword>=<variable(s)>, <keyword>=<variable(s)>, ...)`, where `<variable(s)>` can be a single variable or a tuple of variables (e.g., `ad_annotations=(Duplicated=B, Active=(a,b))`). Currently supported annotations are: $(keys(AD_SUPPORTED_ANNOTATIONS)). - `configcall=kernelcall`: a call to a kernel that is declared parallel, which is used for determining the kernel launch parameters. This keyword is useful, e.g., for generic automatic differentiation using the low-level submodule [`AD`](@ref). - - `backendkwargs...`: keyword arguments to be passed further to CUDA or AMDGPU or Metal (ignored for Threads or Polyester). + - `backendkwargs...`: keyword arguments to be passed further to CUDA, AMDGPU or Metal (ignored for Threads or Polyester). !!! note "Performance note" Kernel launch parameters are automatically defined with heuristics, where not defined with optional kernel arguments. For CUDA and AMDGPU, `nthreads` is typically set to (32,8,1) and `nblocks` accordingly to ensure that enough threads are launched. diff --git a/src/ParallelStencil.jl b/src/ParallelStencil.jl index 3c68ae4f..a46433a6 100644 --- a/src/ParallelStencil.jl +++ b/src/ParallelStencil.jl @@ -44,7 +44,7 @@ https://github.com/omlins/ParallelStencil.jl - [`Data`](@ref) !!! note "Activation of GPU support" - The support for GPU (CUDA or AMDGPU or Metal) is provided with extensions and requires therefore an explicit installation of the corresponding packages (CUDA.jl or AMDGPU.jl or Metal.jl). Note that it is not required to import explicitly the corresponding module (CUDA or AMDGPU or Metal); this is automatically done by [`@init_parallel_stencil`](@ref). + The support for GPU (CUDA, AMDGPU or Metal) is provided with extensions and requires therefore an explicit installation of the corresponding packages (CUDA.jl, AMDGPU.jl or Metal.jl). Note that it is not required to import explicitly the corresponding module (CUDA, AMDGPU or Metal); this is automatically done by [`@init_parallel_stencil`](@ref). To see a description of a macro or module type `?<macroname>` (including the `@`) or `?<modulename>`, respectively. """ diff --git a/src/init_parallel_stencil.jl b/src/init_parallel_stencil.jl index a00ad385..23b1962b 100644 --- a/src/init_parallel_stencil.jl +++ b/src/init_parallel_stencil.jl @@ -28,7 +28,7 @@ Initialize the package ParallelStencil, giving access to its main functionality. Creates a module `Data` in the module where `@init_parallel_stencil` is called from. The module `Data` contains the types as `Data.Number`, `Data.Array` and `Data.CellArray` (type `?Data` *after* calling `@init_parallel_stencil` to see the full description of the module). # Arguments -- `package::Module`: the package used for parallelization (CUDA or AMDGPU or Metal for GPU, or Threads or Polyester for CPU). +- `package::Module`: the package used for parallelization (CUDA, AMDGPU or Metal for GPU, or Threads or Polyester for CPU). - `numbertype::DataType`: the type of numbers used by @zeros, @ones, @rand and @fill and in all array types of module `Data` (e.g. Float32 or Float64). It is contained in `Data.Number` after @init_parallel_stencil. The `numbertype` can be omitted if the other arguments are given as keyword arguments (in that case, the `numbertype` will have to be given explicitly when using the types provided by the module `Data`). - `ndims::Integer`: the number of dimensions used for the stencil computations in the kernels: 1, 2 or 3 (overwritable in each kernel definition).
- `inbounds::Bool=false`: whether to apply `@inbounds` to the kernels by default (overwritable in each kernel definition). diff --git a/src/parallel.jl b/src/parallel.jl index 4f4a55ec..b3bd7f71 100644 --- a/src/parallel.jl +++ b/src/parallel.jl @@ -34,8 +34,8 @@ Declare the `kernelcall` parallel. The kernel will automatically be called as re - `kernelcall`: a call to a kernel that is declared parallel. !!! note "Advanced optional arguments" - `ranges::Tuple{UnitRange{},UnitRange{},UnitRange{}} | Tuple{UnitRange{},UnitRange{}} | Tuple{UnitRange{}} | UnitRange{}`: the ranges of indices in each dimension for which computations must be performed. - - `nblocks::Tuple{Integer,Integer,Integer}`: the number of blocks to be used if the package CUDA or AMDGPU or Metal was selected with [`@init_parallel_kernel`](@ref). - - `nthreads::Tuple{Integer,Integer,Integer}`: the number of threads to be used if the package CUDA or AMDGPU or Metal was selected with [`@init_parallel_kernel`](@ref). + - `nblocks::Tuple{Integer,Integer,Integer}`: the number of blocks to be used if the package CUDA, AMDGPU or Metal was selected with [`@init_parallel_kernel`](@ref). + - `nthreads::Tuple{Integer,Integer,Integer}`: the number of threads to be used if the package CUDA, AMDGPU or Metal was selected with [`@init_parallel_kernel`](@ref). # Keyword arguments - `memopt::Bool=false`: whether the kernel to be launched was generated with `memopt=true` (meaning the keyword was set in the kernel declaration). @@ -44,7 +44,7 @@ Declare the `kernelcall` parallel. The kernel will automatically be called as re - `ad_mode=Enzyme.Reverse`: the automatic differentiation mode (see the documentation of Enzyme.jl for more information). - `ad_annotations=()`: Enzyme variable annotations for automatic differentiation in the format `(<keyword>=<variable(s)>, <keyword>=<variable(s)>, ...)`, where `<variable(s)>` can be a single variable or a tuple of variables (e.g., `ad_annotations=(Duplicated=B, Active=(a,b))`). Currently supported annotations are: $(keys(AD_SUPPORTED_ANNOTATIONS)). - `configcall=kernelcall`: a call to a kernel that is declared parallel, which is used for determining the kernel launch parameters. This keyword is useful, e.g., for generic automatic differentiation using the low-level submodule [`AD`](@ref). - - `backendkwargs...`: keyword arguments to be passed further to CUDA or AMDGPU or Metal (ignored for Threads and Polyester). + - `backendkwargs...`: keyword arguments to be passed further to CUDA, AMDGPU or Metal (ignored for Threads and Polyester). !!! note "Performance note" Kernel launch parameters are automatically defined with heuristics, where not defined with optional kernel arguments. For CUDA and AMDGPU, `nthreads` is typically set to (32,8,1) and `nblocks` accordingly to ensure that enough threads are launched. diff --git a/test/ParallelKernel/test_hide_communication.jl b/test/ParallelKernel/test_hide_communication.jl index e8ab02b3..48171b19 100644 --- a/test/ParallelKernel/test_hide_communication.jl +++ b/test/ParallelKernel/test_hide_communication.jl @@ -28,7 +28,7 @@ end Base.retry_load_extensions() # Potentially needed to load the extensions after the packages have been filtered. const TEST_PRECISIONS = [Float32, Float64] -for package in TEST_PACKAGES +@static for package in TEST_PACKAGES for precision in TEST_PRECISIONS (package == PKG_METAL && precision == Float64) ?
continue : nothing # Metal does not support Float64 diff --git a/test/ParallelKernel/test_kernel_language.jl b/test/ParallelKernel/test_kernel_language.jl index c5a66912..3f598f0d 100644 --- a/test/ParallelKernel/test_kernel_language.jl +++ b/test/ParallelKernel/test_kernel_language.jl @@ -24,7 +24,7 @@ end Base.retry_load_extensions() # Potentially needed to load the extensions after the packages have been filtered. const TEST_PRECISIONS = [Float32, Float64] -for package in TEST_PACKAGES +@static for package in TEST_PACKAGES for precision in TEST_PRECISIONS (package == PKG_METAL && precision == Float64) ? continue : nothing # Metal does not support Float64 diff --git a/test/ParallelKernel/test_parallel.jl b/test/ParallelKernel/test_parallel.jl index fcba1dbf..f02bceb1 100644 --- a/test/ParallelKernel/test_parallel.jl +++ b/test/ParallelKernel/test_parallel.jl @@ -30,7 +30,7 @@ macro compute_with_aliases(A) esc(:(ix + (iz -1)*size($A,1) import Enzyme const TEST_PRECISIONS = [Float32, Float64] -for package in TEST_PACKAGES +@static for package in TEST_PACKAGES for precision in TEST_PRECISIONS (package == PKG_METAL && precision == Float64) ? continue : nothing # Metal does not support Float64 diff --git a/test/test_FiniteDifferences1D.jl b/test/test_FiniteDifferences1D.jl index 5d95b6b5..01f7a120 100644 --- a/test/test_FiniteDifferences1D.jl +++ b/test/test_FiniteDifferences1D.jl @@ -26,9 +26,9 @@ end Base.retry_load_extensions() # Potentially needed to load the extensions after the packages have been filtered. const TEST_PRECISIONS = [Float32, Float64] -for package in TEST_PACKAGES +@static for package in TEST_PACKAGES for precision in TEST_PRECISIONS -(package == PKG_METAL && precision == Float64) ? continue : nothing # Metal does not support Float64 +(package == PKG_METAL && precision == Float64) && continue # Metal does not support Float64 eval(:( @testset "$(basename(@__FILE__)) (package: $(nameof($package))) (precision: $(nameof($precision)))" begin diff --git a/test/test_FiniteDifferences2D.jl b/test/test_FiniteDifferences2D.jl index 7c1f13a1..d70b92a2 100644 --- a/test/test_FiniteDifferences2D.jl +++ b/test/test_FiniteDifferences2D.jl @@ -26,9 +26,9 @@ end Base.retry_load_extensions() # Potentially needed to load the extensions after the packages have been filtered. const TEST_PRECISIONS = [Float32, Float64] -for package in TEST_PACKAGES +@static for package in TEST_PACKAGES for precision in TEST_PRECISIONS -(package == PKG_METAL && precision == Float64) ? continue : nothing # Metal does not support Float64 +(package == PKG_METAL && precision == Float64) && continue # Metal does not support Float64 eval(:( @testset "$(basename(@__FILE__)) (package: $(nameof($package))) (precision: $(nameof($precision)))" begin diff --git a/test/test_FiniteDifferences3D.jl b/test/test_FiniteDifferences3D.jl index a8fb81b0..11db69db 100644 --- a/test/test_FiniteDifferences3D.jl +++ b/test/test_FiniteDifferences3D.jl @@ -26,9 +26,9 @@ end Base.retry_load_extensions() # Potentially needed to load the extensions after the packages have been filtered. const TEST_PRECISIONS = [Float32, Float64] -for package in TEST_PACKAGES +@static for package in TEST_PACKAGES for precision in TEST_PRECISIONS -(package == PKG_METAL && precision == Float64) ? 
continue : nothing # Metal does not support Float64 +(package == PKG_METAL && precision == Float64) && continue # Metal does not support Float64 eval(:( @testset "$(basename(@__FILE__)) (package: $(nameof($package))) (precision: $(nameof($precision)))" begin From dfb98febee8c82c43302a853ddf1bf1ab078e39b Mon Sep 17 00:00:00 2001 From: Giacomo Aloisi <14826807+GiackAloZ@users.noreply.github.com> Date: Wed, 30 Oct 2024 14:02:37 +0100 Subject: [PATCH 41/46] Update test/test_parallel.jl Co-authored-by: Samuel Omlin --- test/test_parallel.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_parallel.jl b/test/test_parallel.jl index 07cbf707..08143670 100644 --- a/test/test_parallel.jl +++ b/test/test_parallel.jl @@ -27,7 +27,7 @@ Base.retry_load_extensions() # Potentially needed to load the extensions after t import ParallelStencil.@gorgeousexpand const TEST_PRECISIONS = [Float32, Float64] -for package in TEST_PACKAGES +@static for package in TEST_PACKAGES for precision in TEST_PRECISIONS (package == PKG_METAL && precision == Float64) ? continue : nothing # Metal does not support Float64 From 2d7d128c6915519cab3581d41a10b57d91998395 Mon Sep 17 00:00:00 2001 From: Giacomo Aloisi <14826807+GiackAloZ@users.noreply.github.com> Date: Wed, 30 Oct 2024 14:02:51 +0100 Subject: [PATCH 42/46] Update test/test_parallel.jl Co-authored-by: Samuel Omlin --- test/test_parallel.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_parallel.jl b/test/test_parallel.jl index 08143670..0c67d6eb 100644 --- a/test/test_parallel.jl +++ b/test/test_parallel.jl @@ -878,7 +878,7 @@ eval(:( end; @reset_parallel_stencil() end; - @testset "2 parallel macros (2D)" begin + @testset "2. parallel macros (2D)" begin @require !@is_initialized() @init_parallel_stencil($package, $precision, 2) @require @is_initialized() From f7d1d7471b39a914a19266456e2923c6fda627c9 Mon Sep 17 00:00:00 2001 From: GiackAloZ Date: Wed, 30 Oct 2024 14:24:58 +0100 Subject: [PATCH 43/46] Update test_parallel.jl to use the specified precision for lam and dt in the diffusion3D_step! function --- test/test_parallel.jl | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/test/test_parallel.jl b/test/test_parallel.jl index 0c67d6eb..105f92ad 100644 --- a/test/test_parallel.jl +++ b/test/test_parallel.jl @@ -224,13 +224,13 @@ eval(:( end @testset "@parallel (3D; on-the-fly)" begin nx, ny, nz = 32, 8, 8 - lam=dt=_dx=_dy=_dz = 1 + lam=dt=_dx=_dy=_dz = $precision(1) T = @zeros(nx, ny, nz); T2 = @zeros(nx, ny, nz); T2_ref = @zeros(nx, ny, nz); Ci = @ones(nx, ny, nz); copy!(T, [ix + (iy-1)*size(T,1) + (iz-1)*size(T,1)*size(T,2) for ix=1:size(T,1), iy=1:size(T,2), iz=1:size(T,3)].^3); - @parallel function diffusion3D_step!(T2, T, Ci, lam, dt, _dx, _dy, _dz) + @parallel function diffusion3D_step!(T2, T, Ci, lam::Data.Number, dt::$precision, _dx, _dy, _dz) @all(qx) = -lam*@d_xi(T)*_dx # Fourier's law of heat conduction @all(qy) = -lam*@d_yi(T)*_dy # ... @all(qz) = -lam*@d_zi(T)*_dz # ... 
@@ -331,7 +331,7 @@ eval(:( @test all(Array(A2) .≈ Array(A2_ref)) end @testset "@parallel_indices (3D, memopt, stencilranges=-1:1)" begin - lam=dt=_dx=_dy=_dz = 1 + lam=dt=_dx=_dy=_dz = $precision(1) T = @zeros(nx, ny, nz); T2 = @zeros(nx, ny, nz); T2_ref = @zeros(nx, ny, nz); @@ -390,13 +390,13 @@ eval(:( @test all(Array(A2) .≈ Array(A2_ref)) end @testset "@parallel (3D, memopt, stencilranges=0:2; on-the-fly)" begin - lam=dt=_dx=_dy=_dz = 1 + lam=dt=_dx=_dy=_dz = $precision(1) T = @zeros(nx, ny, nz); T2 = @zeros(nx, ny, nz); T2_ref = @zeros(nx, ny, nz); Ci = @ones(nx, ny, nz); copy!(T, [ix + (iy-1)*size(T,1) + (iz-1)*size(T,1)*size(T,2) for ix=1:size(T,1), iy=1:size(T,2), iz=1:size(T,3)].^3); - @parallel memopt=true loopsize=3 function diffusion3D_step!(T2, T, Ci, lam, dt, _dx, _dy, _dz) + @parallel memopt=true loopsize=3 function diffusion3D_step!(T2, T, Ci, lam::Data.Number, dt::$precision, _dx, _dy, _dz) @all(qx) = -lam*@d_xi(T)*_dx # Fourier's law of heat conduction @all(qy) = -lam*@d_yi(T)*_dy # ... @all(qz) = -lam*@d_zi(T)*_dz # ... @@ -477,7 +477,7 @@ eval(:( @test all(Array(A2) .≈ Array(A2_ref)) end @testset "@parallel (3D, memopt; 2 arrays, x-y-z- + z-stencil)" begin - lam=dt=_dx=_dy=_dz = 1 + lam=dt=_dx=_dy=_dz = $precision(1) T = @zeros(nx, ny, nz); T2 = @zeros(nx, ny, nz); T2_ref = @zeros(nx, ny, nz); @@ -497,7 +497,7 @@ eval(:( @test all(Array(T2) .≈ Array(T2_ref)) end @testset "@parallel (3D, memopt; 2 arrays, x-y-z- + x-stencil)" begin - lam=dt=_dx=_dy=_dz = 1 + lam=dt=_dx=_dy=_dz = $precision(1) T = @zeros(nx, ny, nz); T2 = @zeros(nx, ny, nz); T2_ref = @zeros(nx, ny, nz); @@ -517,7 +517,7 @@ eval(:( @test all(Array(T2) .≈ Array(T2_ref)) end @testset "@parallel (3D, memopt; 3 arrays, x-y-z- + y- + x-stencil)" begin - lam=dt=_dx=_dy=_dz = 1 + lam=dt=_dx=_dy=_dz = $precision(1) T = @zeros(nx, ny, nz); T2 = @zeros(nx, ny, nz); T2_ref = @zeros(nx, ny, nz); @@ -828,7 +828,7 @@ eval(:( @test all(Array(A2) .== Array(A)) end @testset "@parallel (3D, memopt, stencilranges=0:2)" begin - lam=dt=_dx=_dy=_dz = 1 + lam=dt=_dx=_dy=_dz = $precision(1) T = @zeros(nx, ny, nz); T2 = @zeros(nx, ny, nz); T2_ref = @zeros(nx, ny, nz); @@ -847,7 +847,7 @@ eval(:( @test all(Array(T2) .≈ Array(T2_ref)) end @testset "@parallel (3D, memopt; 3 arrays, x-y-z- + y- + x-stencil)" begin - lam=dt=_dx=_dy=_dz = 1 + lam=dt=_dx=_dy=_dz = $precision(1) T = @zeros(nx, ny, nz); T2 = @zeros(nx, ny, nz); T2_ref = @zeros(nx, ny, nz); @@ -885,7 +885,7 @@ eval(:( @static if $package in [$PKG_CUDA, $PKG_AMDGPU] # TODO add support for Metal nx, ny, nz = 32, 8, 1 @testset "@parallel_indices (2D, memopt, stencilranges=(-1:1,-1:1,0:0))" begin - lam=dt=_dx=_dy = 1 + lam=dt=_dx=_dy = $precision(1) T = @zeros(nx, ny, nz); T2 = @zeros(nx, ny, nz); T2_ref = @zeros(nx, ny, nz); @@ -928,7 +928,7 @@ eval(:( @init_parallel_stencil($package, $precision, 1) @require @is_initialized A = @zeros(4*5*6) - one = 1 + one = $precision(1) @parallel_indices (I...) function write_indices!(A, one) A[I...] = sum((I .- (1,)) .* (one)); return @@ -942,7 +942,7 @@ eval(:( @init_parallel_stencil($package, $precision, 2) @require @is_initialized A = @zeros(4, 5*6) - one = 1 + one = $precision(1) @parallel_indices (I...) function write_indices!(A, one) A[I...] = sum((I .- (1,)) .* (one, size(A,1))); return @@ -956,7 +956,7 @@ eval(:( @init_parallel_stencil($package, $precision, 3) @require @is_initialized A = @zeros(4, 5, 6) - one = 1 + one = $precision(1) @parallel_indices (I...) function write_indices!(A, one) A[I...] 
= sum((I .- (1,)) .* (one, size(A,1), size(A,1)*size(A,2))); return From 325defaa037f85bf632a5590abf13be34e972d59 Mon Sep 17 00:00:00 2001 From: GiackAloZ Date: Wed, 30 Oct 2024 14:54:55 +0100 Subject: [PATCH 44/46] Fix bitwise identical checks for specific tests that were failing --- test/test_parallel.jl | 86 +++++++++++++++++++++---------------------- 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/test/test_parallel.jl b/test/test_parallel.jl index 105f92ad..d696865e 100644 --- a/test/test_parallel.jl +++ b/test/test_parallel.jl @@ -220,7 +220,7 @@ eval(:( return end @parallel write_indices!(A); - @test all(Array(A) .≈ [ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]) + @test all(Array(A) .== [ix + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]) end @testset "@parallel (3D; on-the-fly)" begin nx, ny, nz = 32, 8, 8 @@ -244,7 +244,7 @@ eval(:( + ((T[2:end-1,3:end ,2:end-1] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,1:end-2,2:end-1])).*_dy^2 + ((T[2:end-1,2:end-1,3:end ] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,2:end-1,1:end-2])).*_dz^2) ); - @test all(Array(T2) .≈ Array(T2_ref)) + @test all(Array(T2) .== Array(T2_ref)) end @static if $package in [$PKG_CUDA, $PKG_AMDGPU] # TODO add support for Metal @testset "@parallel memopt (nx, ny, nz = x .* threads)" begin # NOTE: the following does not work for some reason: (nx, ny, nz = ($nx, $ny, $nz))" for (nx, ny, nz) in ((32, 8, 9), (32, 8, 8), (31, 7, 9), (33, 9, 9), (33, 7, 8)) @@ -287,7 +287,7 @@ eval(:( end @parallel memopt=true d2_memopt!(A2, A); A2_ref[:,:,2:end-1] .= A[:,:,3:end] .- 2*A[:,:,2:end-1] .+ A[:,:,1:end-2]; - @test all(Array(A2) .≈ Array(A2_ref)) + @test all(Array(A2) .== Array(A2_ref)) end @testset "@parallel_indices (3D, memopt, stencilranges=(0:0, -1:1, 0:0); y-stencil)" begin A = @zeros(nx, ny, nz); @@ -302,7 +302,7 @@ eval(:( end @parallel memopt=true d2_memopt!(A2, A); A2_ref[:,2:end-1,:] .= A[:,3:end,:] .- 2*A[:,2:end-1,:] .+ A[:,1:end-2,:]; - @test all(Array(A2) .≈ Array(A2_ref)) + @test all(Array(A2) .== Array(A2_ref)) end @testset "@parallel (3D, memopt, stencilranges=(1:1, 1:1, 0:2); z-stencil)" begin A = @zeros(nx, ny, nz); @@ -314,8 +314,8 @@ eval(:( return end @parallel memopt=true d2_memopt!(A2, A); - A2_ref[2:end-1,2:end-1,2:end-1] .= A[2:end-1,2:end-1,3:end] .- 2*A[2:end-1,2:end-1,2:end-1] .+ A[2:end-1,2:end-1,1:end-2]; - @test all(Array(A2) .≈ Array(A2_ref)) + A2_ref[2:end-1,2:end-1,2:end-1] .= (A[2:end-1,2:end-1,3:end] .- A[2:end-1,2:end-1,2:end-1]) .- (A[2:end-1,2:end-1,2:end-1] .- A[2:end-1,2:end-1,1:end-2]); + @test all(Array(A2) .== Array(A2_ref)) end @testset "@parallel (3D, memopt, stencilranges=(1:1, 0:2, 1:1); y-stencil)" begin A = @zeros(nx, ny, nz); @@ -327,8 +327,8 @@ eval(:( return end @parallel memopt=true d2_memopt!(A2, A); - A2_ref[2:end-1,2:end-1,2:end-1] .= A[2:end-1,3:end,2:end-1] .- 2*A[2:end-1,2:end-1,2:end-1] .+ A[2:end-1,1:end-2,2:end-1]; - @test all(Array(A2) .≈ Array(A2_ref)) + A2_ref[2:end-1,2:end-1,2:end-1] .= (A[2:end-1,3:end,2:end-1] .- A[2:end-1,2:end-1,2:end-1]) .- (A[2:end-1,2:end-1,2:end-1] .- A[2:end-1,1:end-2,2:end-1]); + @test all(Array(A2) .== Array(A2_ref)) end @testset "@parallel_indices (3D, memopt, stencilranges=-1:1)" begin lam=dt=_dx=_dy=_dz = $precision(1) @@ -353,7 +353,7 @@ eval(:( - ((.-lam.*(T[2:end-1,3:end ,2:end-1] .- T[2:end-1,2:end-1,2:end-1]).*_dy) .- 
(.-lam.*(T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,1:end-2,2:end-1]).*_dy)).*_dy - ((.-lam.*(T[2:end-1,2:end-1,3:end ] .- T[2:end-1,2:end-1,2:end-1]).*_dz) .- (.-lam.*(T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,2:end-1,1:end-2]).*_dz)).*_dz) ); - @test all(Array(T2) .≈ Array(T2_ref)) + @test all(Array(T2) .== Array(T2_ref)) end @testset "@parallel (3D, memopt, stencilranges=0:2)" begin lam=dt=_dx=_dy=_dz = 1 @@ -372,7 +372,7 @@ eval(:( + ((T[2:end-1,3:end ,2:end-1] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,1:end-2,2:end-1])).*_dy^2 + ((T[2:end-1,2:end-1,3:end ] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,2:end-1,1:end-2])).*_dz^2) ); - @test all(Array(T2) .≈ Array(T2_ref)) + @test all(Array(T2) .== Array(T2_ref)) end @testset "@parallel_indices (3D, memopt, stencilranges=(-4:-1, 2:2, -2:3); x-z-stencil, y-shift)" begin A = @zeros(nx, ny, nz); @@ -387,7 +387,7 @@ eval(:( end @parallel memopt=true higher_order_memopt!(A2, A); A2_ref[5:end-1,3:end,3:end-3] .= A[5:end-1,3:end,6:end] .- 2*A[3:end-3,3:end,3:end-3] .+ A[2:end-4,3:end,1:end-5]; - @test all(Array(A2) .≈ Array(A2_ref)) + @test all(Array(A2) .== Array(A2_ref)) end @testset "@parallel (3D, memopt, stencilranges=0:2; on-the-fly)" begin lam=dt=_dx=_dy=_dz = $precision(1) @@ -410,7 +410,7 @@ eval(:( + ((T[2:end-1,3:end ,2:end-1] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,1:end-2,2:end-1])).*_dy^2 + ((T[2:end-1,2:end-1,3:end ] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,2:end-1,1:end-2])).*_dz^2) ); - @test all(Array(T2) .≈ Array(T2_ref)) + @test all(Array(T2) .== Array(T2_ref)) end @testset "@parallel (3D, memopt, stencilranges=0:0; 2 arrays)" begin A = @zeros(nx, ny, nz); @@ -423,7 +423,7 @@ eval(:( return end @parallel memopt=true copy_memopt!(A2, A, B); - @test all(Array(A2) .≈ Array(A) .+ Array(B)) + @test all(Array(A2) .== Array(A) .+ Array(B)) end @testset "@parallel_indices (3D, memopt; 2 arrays, z-stencil)" begin A = @zeros(nx, ny, nz); @@ -440,7 +440,7 @@ eval(:( end @parallel memopt=true d2_memopt!(A2, A, B); A2_ref[:,:,2:end-1] .= A[:,:,3:end] .- 2*A[:,:,2:end-1] .+ A[:,:,1:end-2] .+ B[:,:,2:end] .- B[:,:,1:end-1]; - @test all(Array(A2) .≈ Array(A2_ref)) + @test all(Array(A2) .== Array(A2_ref)) end @testset "@parallel_indices (3D, memopt; 2 arrays, y-stencil)" begin A = @zeros(nx, ny, nz); @@ -457,7 +457,7 @@ eval(:( end @parallel memopt=true d2_memopt!(A2, A, B); A2_ref[:,2:end-1,:] .= (((A[:,3:end,:] .- 2*A[:,2:end-1,:]) .+ A[:,1:end-2,:] .+ B[:,3:end,:]) .- 2*B[:,2:end-1,:]) .+ B[:,1:end-2,:]; - @test all(Array(A2) .≈ Array(A2_ref)) + @test all(Array(A2) .== Array(A2_ref)) end @testset "@parallel_indices (3D, memopt; 2 arrays, x-stencil)" begin A = @zeros(nx, ny, nz); @@ -474,7 +474,7 @@ eval(:( end @parallel memopt=true d2_memopt!(A2, A, B); A2_ref[2:end-1,:,:] .= A[3:end,:,:] .- 2*A[2:end-1,:,:] .+ A[1:end-2,:,:] .+ B[2:end,:,:] .- B[1:end-1,:,:]; - @test all(Array(A2) .≈ Array(A2_ref)) + @test all(Array(A2) .== Array(A2_ref)) end @testset "@parallel (3D, memopt; 2 arrays, x-y-z- + z-stencil)" begin lam=dt=_dx=_dy=_dz = $precision(1) @@ -494,7 +494,7 @@ eval(:( + ((T[2:end-1,3:end ,2:end-1] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,1:end-2,2:end-1])).*_dy^2 + ((T[2:end-1,2:end-1,3:end ] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,2:end-1,1:end-2])).*_dz^2) ); - @test all(Array(T2) .≈ Array(T2_ref)) + @test all(Array(T2) .== Array(T2_ref)) 
end @testset "@parallel (3D, memopt; 2 arrays, x-y-z- + x-stencil)" begin lam=dt=_dx=_dy=_dz = $precision(1) @@ -514,7 +514,7 @@ eval(:( + ((T[2:end-1,3:end ,2:end-1] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,1:end-2,2:end-1])).*_dy^2 + ((T[2:end-1,2:end-1,3:end ] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,2:end-1,1:end-2])).*_dz^2) ); - @test all(Array(T2) .≈ Array(T2_ref)) + @test all(Array(T2) .== Array(T2_ref)) end @testset "@parallel (3D, memopt; 3 arrays, x-y-z- + y- + x-stencil)" begin lam=dt=_dx=_dy=_dz = $precision(1) @@ -536,7 +536,7 @@ eval(:( + ((T[2:end-1,3:end ,2:end-1] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,1:end-2,2:end-1])).*_dy^2 + ((T[2:end-1,2:end-1,3:end ] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,2:end-1,1:end-2])).*_dz^2) ) + ((B[2:end-1,3:end ,2:end-1] .- B[2:end-1,2:end-1,2:end-1]) .- (B[2:end-1,2:end-1,2:end-1] .- B[2:end-1,1:end-2,2:end-1])); - @test all(Array(T2) .≈ Array(T2_ref)) + @test all(Array(T2) .== Array(T2_ref)) end @testset "@parallel_indices (3D, memopt, stencilranges=(-4:-1, 2:2, -2:3); 3 arrays, x-z-stencil, y-shift)" begin A = @zeros(nx, ny, nz); @@ -567,9 +567,9 @@ eval(:( A2_ref[5:end-1,3:end,3:end-3] .= A[5:end-1,3:end,6:end] .- 2*A[3:end-3,3:end,3:end-3] .+ A[2:end-4,3:end,1:end-5]; B2_ref[5:end-1,3:end,3:end-3] .= B[5:end-1,3:end,6:end] .- 2*B[3:end-3,3:end,3:end-3] .+ B[2:end-4,3:end,1:end-5]; C2_ref[5:end-1,3:end,3:end-3] .= C[5:end-1,3:end,6:end] .- 2*C[3:end-3,3:end,3:end-3] .+ C[2:end-4,3:end,1:end-5]; - @test all(Array(A2) .≈ Array(A2_ref)) - @test all(Array(B2) .≈ Array(B2_ref)) - @test all(Array(C2) .≈ Array(C2_ref)) + @test all(Array(A2) .== Array(A2_ref)) + @test all(Array(B2) .== Array(B2_ref)) + @test all(Array(C2) .== Array(C2_ref)) end @testset "@parallel_indices (3D, memopt, stencilranges=(A=(-4:-1, 2:2, -2:3), B=(-4:-1, 2:2, 1:2), C=(-4:-1, 2:2, -1:0)); 3 arrays, x-z-stencil, y-shift)" begin A = @zeros(nx, ny, nz); @@ -600,9 +600,9 @@ eval(:( A2_ref[5:end-1,3:end,3:end-3] .= A[5:end-1,3:end,6:end] .- 2*A[3:end-3,3:end,3:end-3] .+ A[2:end-4,3:end,1:end-5]; B2_ref[5:end-1,3:end,2:end-1] .= B[5:end-1,3:end,3:end] .- 2*B[3:end-3,3:end,2:end-1] .+ B[2:end-4,3:end,2:end-1]; C2_ref[5:end-1,3:end,1:end-1] .= C[5:end-1,3:end,2:end] .- 2*C[3:end-3,3:end,1:end-1] .+ C[2:end-4,3:end,1:end-1]; - @test all(Array(A2) .≈ Array(A2_ref)) - @test all(Array(B2) .≈ Array(B2_ref)) - @test all(Array(C2) .≈ Array(C2_ref)) + @test all(Array(A2) .== Array(A2_ref)) + @test all(Array(B2) .== Array(B2_ref)) + @test all(Array(C2) .== Array(C2_ref)) end @testset "@parallel_indices (3D, memopt, stencilranges=(A=(-4:-1, 2:2, -2:3), B=(-4:1, 2:2, 1:2), C=(-1:-1, 2:2, -1:0)); 3 arrays, x-z-stencil, y-shift)" begin A = @zeros(nx, ny, nz); @@ -633,9 +633,9 @@ eval(:( A2_ref[5:end-1,3:end,3:end-3] .= A[5:end-1,3:end,6:end] .- 2*A[3:end-3,3:end,3:end-3] .+ A[2:end-4,3:end,1:end-5]; B2_ref[7:end-1,3:end,2:end-1] .= B[7:end-1,3:end,3:end] .- 2*B[3:end-5,3:end,2:end-1] .+ B[2:end-6,3:end,2:end-1]; C2_ref[2:end-1,3:end,1:end-1] .= C[2:end-1,3:end,2:end] .- 2*C[2:end-1,3:end,1:end-1] .+ C[2:end-1,3:end,1:end-1]; - @test all(Array(A2) .≈ Array(A2_ref)) - @test all(Array(B2) .≈ Array(B2_ref)) - @test all(Array(C2) .≈ Array(C2_ref)) + @test all(Array(A2) .== Array(A2_ref)) + @test all(Array(B2) .== Array(B2_ref)) + @test all(Array(C2) .== Array(C2_ref)) end @testset "@parallel_indices (3D, memopt, optvars=(A, C), loopdim=3, loopsize=3, 
optranges=(A=(-4:-1, 2:2, -2:3), B=(-4:1, 2:2, 1:2), C=(-1:-1, 2:2, -1:0)); stencilranges=(A=(-4:-1, 2:2, -2:3), B=(-4:1, 2:2, 1:2), C=(-1:-1, 2:2, -1:0)), 3 arrays, x-z-stencil, y-shift)" begin A = @zeros(nx, ny, nz); @@ -690,9 +690,9 @@ eval(:( A2_ref[5:end-1,3:end,3:end-3] .= A[5:end-1,3:end,6:end] .- 2*A[3:end-3,3:end,3:end-3] .+ A[2:end-4,3:end,1:end-5]; B2_ref[7:end-1,3:end,2:end-1] .= B[7:end-1,3:end,3:end] .- 2*B[3:end-5,3:end,2:end-1] .+ B[2:end-6,3:end,2:end-1]; C2_ref[2:end-1,3:end,1:end-1] .= C[2:end-1,3:end,2:end] .- 2*C[2:end-1,3:end,1:end-1] .+ C[2:end-1,3:end,1:end-1]; - @test all(Array(A2) .≈ Array(A2_ref)) - @test all(Array(B2) .≈ Array(B2_ref)) - @test all(Array(C2) .≈ Array(C2_ref)) + @test all(Array(A2) .== Array(A2_ref)) + @test all(Array(B2) .== Array(B2_ref)) + @test all(Array(C2) .== Array(C2_ref)) end @testset "@parallel_indices (3D, memopt, optvars=(A, C), loopdim=3, loopsize=3, optranges=(A=(-4:-1, 2:2, -2:3), B=(-4:1, 2:2, 1:2), C=(-1:-1, 2:2, -1:0)); stencilranges=(A=(-4:-1, 2:2, -2:3), B=(-4:1, 2:2, 1:2), C=(-1:-1, 2:2, -1:0)), 3 arrays, x-z-stencil, y-shift)" begin A = @zeros(nx, ny, nz); @@ -747,9 +747,9 @@ eval(:( A2_ref[5:end-1,3:end,3:end-3] .= A[5:end-1,3:end,6:end] .- 2*A[3:end-3,3:end,3:end-3] .+ A[2:end-4,3:end,1:end-5]; B2_ref[7:end-1,3:end,2:end-1] .= B[7:end-1,3:end,3:end] .- 2*B[3:end-5,3:end,2:end-1] .+ B[2:end-6,3:end,2:end-1]; C2_ref[2:end-1,3:end,1:end-1] .= C[2:end-1,3:end,2:end] .- 2*C[2:end-1,3:end,1:end-1] .+ C[2:end-1,3:end,1:end-1]; - @test all(Array(A2) .≈ Array(A2_ref)) - @test all(Array(B2) .≈ Array(B2_ref)) - @test all(Array(C2) .≈ Array(C2_ref)) + @test all(Array(A2) .== Array(A2_ref)) + @test all(Array(B2) .== Array(B2_ref)) + @test all(Array(C2) .== Array(C2_ref)) end @testset "@parallel_indices (3D, memopt, optvars=(A, B), loopdim=3, loopsize=3, optranges=(A=(-1:-1, 2:2, -2:3), B=(-4:1, 2:2, 1:1)); stencilranges=(A=(-4:-1, 2:2, -2:3), B=(-4:1, 2:2, 1:2), C=(-1:-1, 2:2, -1:0)), 3 arrays, x-z-stencil, y-shift)" begin A = @zeros(nx, ny, nz); @@ -795,9 +795,9 @@ eval(:( A2_ref[5:end-1,3:end,3:end-3] .= A[5:end-1,3:end,6:end] .- 2*A[3:end-3,3:end,3:end-3] .+ A[2:end-4,3:end,1:end-5]; B2_ref[7:end-1,3:end,2:end-1] .= B[7:end-1,3:end,3:end] .- 2*B[3:end-5,3:end,2:end-1] .+ B[2:end-6,3:end,2:end-1]; C2_ref[2:end-1,3:end,1:end-1] .= C[2:end-1,3:end,2:end] .- 2*C[2:end-1,3:end,1:end-1] .+ C[2:end-1,3:end,1:end-1]; - @test all(Array(A2) .≈ Array(A2_ref)) - @test all(Array(B2) .≈ Array(B2_ref)) - @test all(Array(C2) .≈ Array(C2_ref)) + @test all(Array(A2) .== Array(A2_ref)) + @test all(Array(B2) .== Array(B2_ref)) + @test all(Array(C2) .== Array(C2_ref)) end end @testset "@parallel memopt (nx, ny, nz != x .* threads)" begin @@ -844,7 +844,7 @@ eval(:( + ((T[2:end-1,3:end ,2:end-1] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,1:end-2,2:end-1])).*_dy^2 + ((T[2:end-1,2:end-1,3:end ] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,2:end-1,1:end-2])).*_dz^2) ); - @test all(Array(T2) .≈ Array(T2_ref)) + @test all(Array(T2) .== Array(T2_ref)) end @testset "@parallel (3D, memopt; 3 arrays, x-y-z- + y- + x-stencil)" begin lam=dt=_dx=_dy=_dz = $precision(1) @@ -866,7 +866,7 @@ eval(:( + ((T[2:end-1,3:end ,2:end-1] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,1:end-2,2:end-1])).*_dy^2 + ((T[2:end-1,2:end-1,3:end ] .- T[2:end-1,2:end-1,2:end-1]) .- (T[2:end-1,2:end-1,2:end-1] .- T[2:end-1,2:end-1,1:end-2])).*_dz^2) ) + ((B[2:end-1,3:end ,2:end-1] .- 
B[2:end-1,2:end-1,2:end-1]) .- (B[2:end-1,2:end-1,2:end-1] .- B[2:end-1,1:end-2,2:end-1])); - @test all(Array(T2) .≈ Array(T2_ref)) + @test all(Array(T2) .== Array(T2_ref)) end end end @@ -905,7 +905,7 @@ eval(:( - ((.-lam.*(T[3:end ,2:end-1,1] .- T[2:end-1,2:end-1,1]).*_dx) .- (.-lam.*(T[2:end-1,2:end-1,1] .- T[1:end-2,2:end-1,1]).*_dx)).*_dx - ((.-lam.*(T[2:end-1,3:end ,1] .- T[2:end-1,2:end-1,1]).*_dy) .- (.-lam.*(T[2:end-1,2:end-1,1] .- T[2:end-1,1:end-2,1]).*_dy)).*_dy) ); - @test all(Array(T2) .≈ Array(T2_ref)) + @test all(Array(T2) .== Array(T2_ref)) end; end; @reset_parallel_stencil() @@ -934,7 +934,7 @@ eval(:( return end @parallel write_indices!(A, one); - @test all(Array(A) .≈ [(ix-1) for ix=1:size(A,1)]) + @test all(Array(A) .== [(ix-1) for ix=1:size(A,1)]) @reset_parallel_stencil() end; @testset "@parallel_indices (I...) (2D)" begin @@ -948,7 +948,7 @@ eval(:( return end @parallel write_indices!(A, one); - @test all(Array(A) .≈ [(ix-1) + (iy-1)*size(A,1) for ix=1:size(A,1), iy=1:size(A,2)]) + @test all(Array(A) .== [(ix-1) + (iy-1)*size(A,1) for ix=1:size(A,1), iy=1:size(A,2)]) @reset_parallel_stencil() end; @testset "@parallel_indices (I...) (3D)" begin @@ -962,7 +962,7 @@ eval(:( return end @parallel write_indices!(A, one); - @test all(Array(A) .≈ [(ix-1) + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]) + @test all(Array(A) .== [(ix-1) + (iy-1)*size(A,1) + (iz-1)*size(A,1)*size(A,2) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]) @reset_parallel_stencil() end; end; From 7826b4d5144e86d18a88df5682270e93032b8a83 Mon Sep 17 00:00:00 2001 From: Giacomo Aloisi <14826807+GiackAloZ@users.noreply.github.com> Date: Wed, 30 Oct 2024 15:03:36 +0100 Subject: [PATCH 45/46] Update test/ParallelKernel/test_allocators.jl Co-authored-by: Samuel Omlin --- test/ParallelKernel/test_allocators.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/ParallelKernel/test_allocators.jl b/test/ParallelKernel/test_allocators.jl index b58d3212..c0350d81 100644 --- a/test/ParallelKernel/test_allocators.jl +++ b/test/ParallelKernel/test_allocators.jl @@ -30,7 +30,7 @@ end Base.retry_load_extensions() # Potentially needed to load the extensions after the packages have been filtered. const DATA_INDEX = ParallelStencil.INT_THREADS # TODO: using Data.Index does not work in combination with @reset_parallel_kernel, because the macros from module Test alternate the order of evaluation, resulting in the Data module being replaced with an empty module before Data.Index is evaluated. If at some point the indexing varies depending on the used package, then something more sophisticated is needed here (e.g., wrapping the test for each package in a module and using then Data.Index everywhere). 
-for package in TEST_PACKAGES +@static for package in TEST_PACKAGES eval(:( @testset "$(basename(@__FILE__)) (package: $(nameof($package)))" begin From 71798167251da63cf90c657440c8a7117498673c Mon Sep 17 00:00:00 2001 From: GiackAloZ Date: Wed, 30 Oct 2024 15:28:52 +0100 Subject: [PATCH 46/46] Update to use Metal.device() instead of Metal.current_device() (the latter is deprecated) Also add tests that were TODO --- src/ParallelKernel/MetalExt/shared.jl | 4 ++-- src/ParallelKernel/parallel.jl | 2 +- test/ParallelKernel/test_kernel_language.jl | 4 ++-- test/ParallelKernel/test_parallel.jl | 12 +++++++++++- 4 files changed, 16 insertions(+), 6 deletions(-) diff --git a/src/ParallelKernel/MetalExt/shared.jl b/src/ParallelKernel/MetalExt/shared.jl index 8387dc37..60b71499 100644 --- a/src/ParallelKernel/MetalExt/shared.jl +++ b/src/ParallelKernel/MetalExt/shared.jl @@ -19,12 +19,12 @@ let metalqueues = Array{MTL.MTLCommandQueue}(undef, 0) function get_priority_metalstream(id::Integer) - while (id > length(priority_metalqueues)) push!(priority_metalqueues, MTL.MTLCommandQueue(Metal.current_device())) end # No priority setting available in Metal queues. + while (id > length(priority_metalqueues)) push!(priority_metalqueues, MTL.MTLCommandQueue(Metal.device())) end # No priority setting available in Metal queues. return priority_metalqueues[id] end function get_metalstream(id::Integer) - while (id > length(metalqueues)) push!(metalqueues, MTL.MTLCommandQueue(Metal.current_device())) end + while (id > length(metalqueues)) push!(metalqueues, MTL.MTLCommandQueue(Metal.device())) end return metalqueues[id] end end \ No newline at end of file diff --git a/src/ParallelKernel/parallel.jl b/src/ParallelKernel/parallel.jl index 24c4e3f8..8fb54f5b 100644 --- a/src/ParallelKernel/parallel.jl +++ b/src/ParallelKernel/parallel.jl @@ -598,7 +598,7 @@ end function default_stream(package) if (package == PKG_CUDA) return :(CUDA.stream()) # Use the default stream of the task. elseif (package == PKG_AMDGPU) return :(AMDGPU.stream()) # Use the default stream of the task. - elseif (package == PKG_METAL) return :(Metal.global_queue(Metal.current_device())) # Use the default queue of the task. + elseif (package == PKG_METAL) return :(Metal.global_queue(Metal.device())) # Use the default queue of the task.
else @ModuleInternalError("unsupported GPU package (obtained: $package).") end end \ No newline at end of file diff --git a/test/ParallelKernel/test_kernel_language.jl b/test/ParallelKernel/test_kernel_language.jl index 3f598f0d..fe4ffd76 100644 --- a/test/ParallelKernel/test_kernel_language.jl +++ b/test/ParallelKernel/test_kernel_language.jl @@ -60,8 +60,8 @@ eval(:( @test @prettystring(1, @threadIdx()) == "Metal.thread_position_in_threadgroup_3d()" @test @prettystring(1, @sync_threads()) == "Metal.threadgroup_barrier(; flag = Metal.MemoryFlagThreadGroup)" @test @prettystring(1, @sharedMem($precision, (2,3))) == "ParallelStencil.ParallelKernel.@sharedMem_metal $(nameof($precision)) (2, 3)" - # @test @prettystring(1, @pk_show()) == "Metal.@mtlshow" - # @test @prettystring(1, @pk_println()) == "Metal.@mtlprintln" + # @test @prettystring(1, @pk_show()) == "Metal.@mtlshow" #TODO: not yet supported for Metal + # @test @prettystring(1, @pk_println()) == "Metal.@mtlprintln" #TODO: not yet supported for Metal elseif @iscpu($package) @test @prettystring(1, @gridDim()) == "ParallelStencil.ParallelKernel.@gridDim_cpu" @test @prettystring(1, @blockIdx()) == "ParallelStencil.ParallelKernel.@blockIdx_cpu" diff --git a/test/ParallelKernel/test_parallel.jl b/test/ParallelKernel/test_parallel.jl index f02bceb1..a6585847 100644 --- a/test/ParallelKernel/test_parallel.jl +++ b/test/ParallelKernel/test_parallel.jl @@ -66,7 +66,17 @@ eval(:( call = @prettystring(1, @parallel nblocks nthreads stream=mystream f(A)) @test occursin("AMDGPU.@roc gridsize = nblocks groupsize = nthreads stream = mystream f(A, ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.compute_ranges(nblocks .* nthreads)), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.compute_ranges(nblocks .* nthreads)))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.compute_ranges(nblocks .* nthreads)))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.compute_ranges(nblocks .* nthreads)))[3])))", call) elseif $package == $PKG_METAL - ## TODO + call = @prettystring(1, @parallel f(A)) + @test occursin("Metal.@metal groups = ParallelStencil.ParallelKernel.compute_nblocks(length.(ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A))), ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A))); nthreads_x_max = 32)) threads = ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A))); nthreads_x_max = 32) queue = Metal.global_queue(Metal.device()) f(A, ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[3])))", call) + @test occursin("Metal.synchronize(Metal.global_queue(Metal.device()))", call) + call = @prettystring(1, @parallel ranges f(A)) + @test occursin("Metal.@metal groups = ParallelStencil.ParallelKernel.compute_nblocks(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges)), 
ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges)); nthreads_x_max = 32)) threads = ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges)); nthreads_x_max = 32) queue = Metal.global_queue(Metal.device()) f(A, ParallelStencil.ParallelKernel.promote_ranges(ranges), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[3])))", call) + call = @prettystring(1, @parallel nblocks nthreads f(A)) + @test occursin("Metal.@metal groups = nblocks threads = nthreads queue = Metal.global_queue(Metal.device()) f(A, ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.compute_ranges(nblocks .* nthreads)), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.compute_ranges(nblocks .* nthreads)))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.compute_ranges(nblocks .* nthreads)))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.compute_ranges(nblocks .* nthreads)))[3])))", call) + call = @prettystring(1, @parallel ranges nblocks nthreads f(A)) + @test occursin("Metal.@metal groups = nblocks threads = nthreads queue = Metal.global_queue(Metal.device()) f(A, ParallelStencil.ParallelKernel.promote_ranges(ranges), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[3])))", call) + call = @prettystring(1, @parallel nblocks nthreads stream=mystream f(A)) + @test occursin("Metal.@metal groups = nblocks threads = nthreads queue = mystream f(A, ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.compute_ranges(nblocks .* nthreads)), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.compute_ranges(nblocks .* nthreads)))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.compute_ranges(nblocks .* nthreads)))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.compute_ranges(nblocks .* nthreads)))[3])))", call) elseif @iscpu($package) @test @prettystring(1, @parallel f(A)) == "f(A, ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[3])))" @test @prettystring(1, @parallel ranges f(A)) == "f(A, ParallelStencil.ParallelKernel.promote_ranges(ranges), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[3])))"