From 11424af8fe2f5b4d41a2ae36a7b3d2961fe9d836 Mon Sep 17 00:00:00 2001
From: Samuel Omlin
Date: Tue, 18 Jun 2024 19:13:47 +0200
Subject: [PATCH 1/7] fix async macro

---
 src/parallel.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/parallel.jl b/src/parallel.jl
index 049e0363..93f2ea0f 100644
--- a/src/parallel.jl
+++ b/src/parallel.jl
@@ -162,7 +162,7 @@ function parallel(source::LineNumberNode, caller::Module, args::Union{Symbol,Exp
         if (length(posargs) > 1) @ArgumentError("maximum one positional argument (ranges) is allowed in a @parallel memopt=true call.") end
         parallel_call_memopt(caller, posargs..., kernelarg, backend_kwargs_expr, async; kwargs...)
     else
-        ParallelKernel.parallel(caller, posargs..., backend_kwargs_expr..., configcall_kwarg_expr, kernelarg; package=package)
+        ParallelKernel.parallel(caller, posargs..., backend_kwargs_expr..., configcall_kwarg_expr, kernelarg; package=package, async=async)
     end
 end
 

From e27d1f3426963f2fc3c3493bb462b4ff6c98e6e8 Mon Sep 17 00:00:00 2001
From: Samuel Omlin
Date: Fri, 5 Jul 2024 16:46:28 +0200
Subject: [PATCH 2/7] introduce nthreads_x_max for amdgpu

---
 src/ParallelKernel/parallel.jl | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/ParallelKernel/parallel.jl b/src/ParallelKernel/parallel.jl
index 4f016703..18bfc1b2 100644
--- a/src/ParallelKernel/parallel.jl
+++ b/src/ParallelKernel/parallel.jl
@@ -269,8 +269,9 @@ function parallel_call_gpu(nblocks::Union{Symbol,Expr}, nthreads::Union{Symbol,E
 end
 
 function parallel_call_gpu(ranges::Union{Symbol,Expr}, kernelcall::Expr, backend_kwargs_expr::Array, async::Bool, package::Symbol; stream::Union{Symbol,Expr}=default_stream(package), shmem::Union{Symbol,Expr,Nothing}=nothing, launch::Bool=true, configcall::Expr=kernelcall)
+    nthreads_x_max = determine_nthreads_x_max(package)
     maxsize  = :(length.(ParallelStencil.ParallelKernel.promote_ranges($ranges)))
-    nthreads = :( ParallelStencil.ParallelKernel.compute_nthreads($maxsize) )
+    nthreads = :( ParallelStencil.ParallelKernel.compute_nthreads($maxsize; nthreads_x_max=$nthreads_x_max) )
     nblocks  = :( ParallelStencil.ParallelKernel.compute_nblocks($maxsize, $nthreads) )
     parallel_call_gpu(ranges, nblocks, nthreads, kernelcall, backend_kwargs_expr, async, package; stream=stream, shmem=shmem, launch=launch)
 end
@@ -522,9 +523,9 @@ function compute_ranges(maxsize)
     return (1:maxsize[1], 1:maxsize[2], 1:maxsize[3])
 end
 
-function compute_nthreads(maxsize; nthreads_max=NTHREADS_MAX, flatdim=0) # This is a heuristic, which results in (32,8,1) threads, except if maxsize[1] < 32 or maxsize[2] < 8.
+function compute_nthreads(maxsize; nthreads_x_max=NTHREADS_X_MAX, nthreads_max=NTHREADS_MAX, flatdim=0) # This is a heuristic, which results in (32,8,1) threads, except if maxsize[1] < 32 or maxsize[2] < 8.
     maxsize = promote_maxsize(maxsize)
-    nthreads_x = min(32, (flatdim==1) ? 1 : maxsize[1])
+    nthreads_x = min(nthreads_x_max, (flatdim==1) ? 1 : maxsize[1])
     nthreads_y = min(ceil(Int,nthreads_max/nthreads_x), (flatdim==2) ? 1 : maxsize[2])
     nthreads_z = min(ceil(Int,nthreads_max/(nthreads_x*nthreads_y)), (flatdim==3) ? 1 : maxsize[3])
     return (nthreads_x, nthreads_y , nthreads_z)
@@ -536,6 +537,8 @@ function compute_nblocks(maxsize, nthreads)
     return ceil.(Int, maxsize./nthreads)
 end
 
+determine_nthreads_x_max(package::Symbol) = (package == PKG_AMDGPU) ? NTHREADS_X_MAX_AMDGPU : NTHREADS_X_MAX
+
 
 ## FUNCTIONS TO CREATE KERNEL LAUNCH AND SYNCHRONIZATION CALLS
 
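The heuristic parametrized above can be reproduced standalone (a minimal sketch; `nthreads_heuristic` is a hypothetical name, the defaults mirror `NTHREADS_X_MAX = 32` and `NTHREADS_MAX = 256`, and the promotion/validation steps of the real `compute_nthreads` are omitted):

```julia
# Minimal sketch of the compute_nthreads heuristic after patch 2:
# cap the x dimension at nthreads_x_max, then fill y and z so that the
# total stays at or below nthreads_max.
function nthreads_heuristic(maxsize; nthreads_x_max=32, nthreads_max=256, flatdim=0)
    nthreads_x = min(nthreads_x_max, (flatdim==1) ? 1 : maxsize[1])
    nthreads_y = min(ceil(Int, nthreads_max/nthreads_x), (flatdim==2) ? 1 : maxsize[2])
    nthreads_z = min(ceil(Int, nthreads_max/(nthreads_x*nthreads_y)), (flatdim==3) ? 1 : maxsize[3])
    return (nthreads_x, nthreads_y, nthreads_z)
end

nthreads_heuristic((512, 512, 512))                     # (32, 8, 1): CUDA default
nthreads_heuristic((512, 512, 512); nthreads_x_max=64)  # (64, 4, 1): AMDGPU default
```

With the AMDGPU cap of 64, the block shape widens in x and narrows in y while the total thread count per block stays at 256.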
From a9a662992b7ccb0c7b450f3436e45b298c2b6ff1 Mon Sep 17 00:00:00 2001
From: Samuel Omlin
Date: Fri, 5 Jul 2024 16:47:34 +0200
Subject: [PATCH 3/7] introduce nthreads_x_max for amdgpu

---
 src/ParallelKernel/shared.jl | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/ParallelKernel/shared.jl b/src/ParallelKernel/shared.jl
index 2c90004d..da55bbc3 100644
--- a/src/ParallelKernel/shared.jl
+++ b/src/ParallelKernel/shared.jl
@@ -20,6 +20,8 @@ const INT_CUDA = Int64 # NOTE: unsigned integers are not yet
 const INT_AMDGPU = Int64 # NOTE: ...
 const INT_POLYESTER = Int64 # NOTE: ...
 const INT_THREADS = Int64 # NOTE: ...
+const NTHREADS_X_MAX = 32
+const NTHREADS_X_MAX_AMDGPU = 64
 const NTHREADS_MAX = 256
 const INDICES = (gensym_world("ix", @__MODULE__), gensym_world("iy", @__MODULE__), gensym_world("iz", @__MODULE__))
 const RANGES_VARNAME = gensym_world("ranges", @__MODULE__)
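The two caps follow the hardware SIMD width: NVIDIA warps are 32 threads, AMD GCN/CDNA wavefronts are 64 work items, so capping the fastest-varying (x) dimension at the SIMD width keeps memory accesses coalesced without wasting lanes. A hedged sanity check (assumes the internal, non-exported `determine_nthreads_x_max` from patch 2 is reachable as qualified below, and that the `PKG_*` constants are the plain backend symbols):

```julia
using Test, ParallelStencil
import ParallelStencil.ParallelKernel: determine_nthreads_x_max

@test determine_nthreads_x_max(:CUDA)   == 32  # NTHREADS_X_MAX: warp size
@test determine_nthreads_x_max(:AMDGPU) == 64  # NTHREADS_X_MAX_AMDGPU: wavefront size
```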
From 7bdb7ca1b138414e40afb2175ddaec730bf03583 Mon Sep 17 00:00:00 2001
From: Samuel Omlin
Date: Fri, 5 Jul 2024 16:49:05 +0200
Subject: [PATCH 4/7] introduce nthreads_max for amdgpu

---
 src/parallel.jl | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/src/parallel.jl b/src/parallel.jl
index 93f2ea0f..02983da7 100644
--- a/src/parallel.jl
+++ b/src/parallel.jl
@@ -321,6 +321,9 @@ end
 
 function parallel_call_memopt(caller::Module, ranges::Union{Symbol,Expr}, kernelcall::Expr, backend_kwargs_expr::Array, async::Bool; memopt::Bool=false, configcall::Expr=kernelcall)
     if haskey(backend_kwargs_expr, :shmem) @KeywordArgumentError("@parallel : keyword `shmem` is not allowed when memopt=true is set.") end
+    package = get_package(caller)
+    nthreads_x_max = ParallelKernel.determine_nthreads_x_max(package)
+    nthreads_max_memopt = determine_nthreads_max_memopt(package)
     configcall_kwarg_expr = :(configcall=$configcall)
     metadata_call = create_metadata_call(configcall)
     metadata_module = metadata_call
@@ -331,7 +334,7 @@ function parallel_call_memopt(caller::Module, ranges::Union{Symbol,Expr}, kernel
     loopsize = :($(metadata_module).loopsize)
     loopsizes = :(($loopdim==3) ? (1, 1, $loopsize) : ($loopdim==2) ? (1, $loopsize, 1) : ($loopsize, 1, 1))
     maxsize = :(cld.(length.(ParallelStencil.ParallelKernel.promote_ranges($ranges)), $loopsizes))
-    nthreads = :( ParallelStencil.compute_nthreads_memopt($maxsize, $loopdim, $stencilranges) )
+    nthreads = :( ParallelStencil.compute_nthreads_memopt($nthreads_x_max, $nthreads_max_memopt, $maxsize, $loopdim, $stencilranges) )
     nblocks = :( ParallelStencil.ParallelKernel.compute_nblocks($maxsize, $nthreads) )
     numbertype = get_numbertype(caller) # not :(eltype($(optvars)[1])) # TODO: see how to obtain number type properly for each array: the type of the call arguments corresponding to the optimization variables should be checked
     dim1 = :(($loopdim==3) ? 1 : ($loopdim==2) ? 1 : 2) # TODO: to be determined if that is what is desired for loopdim 1 and 2.
@@ -344,11 +347,14 @@ end
 
 function parallel_call_memopt(caller::Module, kernelcall::Expr, backend_kwargs_expr::Array, async::Bool; memopt::Bool=false, configcall::Expr=kernelcall)
-    metadata_call = create_metadata_call(configcall)
-    metadata_module = metadata_call
-    loopdim = :($(metadata_module).loopdim)
-    is_parallel_kernel = :($(metadata_module).is_parallel_kernel)
-    ranges = :( ($is_parallel_kernel) ? ParallelStencil.get_ranges_memopt($loopdim, $(configcall.args[2:end]...)) : ParallelStencil.ParallelKernel.get_ranges($(configcall.args[2:end]...)))
+    package = get_package(caller)
+    nthreads_x_max = ParallelKernel.determine_nthreads_x_max(package)
+    nthreads_max_memopt = determine_nthreads_max_memopt(package)
+    metadata_call = create_metadata_call(configcall)
+    metadata_module = metadata_call
+    loopdim = :($(metadata_module).loopdim)
+    is_parallel_kernel = :($(metadata_module).is_parallel_kernel)
+    ranges = :( ($is_parallel_kernel) ? ParallelStencil.get_ranges_memopt($nthreads_x_max, $nthreads_max_memopt, $loopdim, $(configcall.args[2:end]...)) : ParallelStencil.ParallelKernel.get_ranges($(configcall.args[2:end]...)))
     parallel_call_memopt(caller, ranges, kernelcall, backend_kwargs_expr, async; memopt=memopt, configcall=configcall)
 end
 
@@ -362,15 +368,16 @@ end
 
 ## FUNCTIONS TO DETERMINE OPTIMIZATION PARAMETERS
 
+determine_nthreads_max_memopt(package::Symbol) = (package == PKG_AMDGPU) ? NTHREADS_MAX_MEMOPT_AMDGPU : NTHREADS_MAX_MEMOPT_CUDA
 determine_loopdim(indices::Union{Symbol,Expr}) = isa(indices,Expr) && (length(indices.args)==3) ? 3 : LOOPDIM_NONE # TODO: currently only loopdim=3 is supported.
-compute_loopsize() = LOOPSIZE
+compute_loopsize()                             = LOOPSIZE
 
 
 ## FUNCTIONS TO COMPUTE NTHREADS, NBLOCKS
 
-function compute_nthreads_memopt(maxsize, loopdim, stencilranges) # This is a heuristic, which typically results in (32,4,1) threads for a 3-D case.
+function compute_nthreads_memopt(nthreads_x_max, nthreads_max_memopt, maxsize, loopdim, stencilranges) # This is a heuristic, which typically results in (32,4,1) threads for a 3-D case.
     maxsize = promote_maxsize(maxsize)
-    nthreads = ParallelKernel.compute_nthreads(maxsize; nthreads_max=NTHREADS_MAX_LOOPOPT, flatdim=loopdim)
+    nthreads = ParallelKernel.compute_nthreads(maxsize; nthreads_x_max=nthreads_x_max, nthreads_max=nthreads_max_memopt, flatdim=loopdim)
     for stencilranges_A in values(stencilranges)
         haloextensions = ((length(stencilranges_A[1])-1)*(loopdim!=1), (length(stencilranges_A[2])-1)*(loopdim!=2), (length(stencilranges_A[3])-1)*(loopdim!=3))
         if (2*prod(nthreads) < prod(nthreads .+ haloextensions)) @ArgumentError("@parallel : the automatic determination of nthreads is not possible for this case. Please specify `nthreads` and `nblocks`.") end # NOTE: this is a simple heuristic to compare the number of threads to the total number of cells including halo.
@@ -380,10 +387,10 @@
     return nthreads
 end
 
-function get_ranges_memopt(loopdim, args...)
+function get_ranges_memopt(nthreads_x_max, nthreads_max_memopt, loopdim, args...)
     ranges = ParallelKernel.get_ranges(args...)
     maxsize = length.(ranges)
-    nthreads = ParallelKernel.compute_nthreads(maxsize; nthreads_max=NTHREADS_MAX_LOOPOPT, flatdim=loopdim)
+    nthreads = ParallelKernel.compute_nthreads(maxsize; nthreads_x_max=nthreads_x_max, nthreads_max=nthreads_max_memopt, flatdim=loopdim)
     # TODO: the following code reduces performance from ~482 GB/s to ~478 GB/s
     rests = maxsize .% nthreads
     ranges_adjustment = ( (rests[1] != 0) ? (nthreads[1] - rests[1]) : 0, (rests[2] != 0) ? (nthreads[2] - rests[2]) : 0, (rests[3] != 0) ? (nthreads[3] - rests[3]) : 0 )
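The range padding at the end of this patch can be illustrated in isolation (a sketch; `ifelse.` stands in for the per-component ternaries of the diff, and the variable names are reused from it):

```julia
# Pad each dimension of the index ranges up to a multiple of the block size,
# so that every block is fully populated (out-of-range indices are assumed to
# be masked later by the range-length arguments passed to the kernel).
maxsize  = (100, 100, 100)
nthreads = (32, 4, 1)
rests    = maxsize .% nthreads                         # (4, 0, 0)
adjust   = ifelse.(rests .!= 0, nthreads .- rests, 0)  # (28, 0, 0)
padded   = maxsize .+ adjust                           # (128, 100, 100): divisible by nthreads
```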
From 3b6fb9458daf5314d7ad9bc46610a7bb4a05c094 Mon Sep 17 00:00:00 2001
From: Samuel Omlin
Date: Fri, 5 Jul 2024 16:49:28 +0200
Subject: [PATCH 5/7] introduce nthreads_max for amdgpu

---
 src/shared.jl | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/src/shared.jl b/src/shared.jl
index 9bbd7293..0c91de2b 100644
--- a/src/shared.jl
+++ b/src/shared.jl
@@ -15,22 +15,23 @@ Return an expression that evaluates to `true` if the indices generated by @paral
 This macro is not intended for explicit manual usage. Calls to it are automatically added by @parallel where required.
 """
-const SUPPORTED_NDIMS = [1, 2, 3]
-const NDIMS_NONE = 0
-const ERRMSG_KERNEL_UNSUPPORTED = "unsupported kernel statements in @parallel kernel definition: @parallel is only applicable to kernels that contain exclusively array assignments using macros from FiniteDifferences{1|2|3}D or from another compatible computation submodule. @parallel_indices supports any kind of statements in the kernels."
-const ERRMSG_CHECK_NDIMS = "ndims must be evaluatable at parse time (e.g. literal or constant) and has to be one of the following Integers: $(join(SUPPORTED_NDIMS,", "))"
-const ERRMSG_CHECK_MEMOPT = "memopt must be evaluatable at parse time (e.g. literal or constant) and has to be of type Bool."
-const PSNumber = PKNumber
-const LOOPSIZE = 16
-const LOOPDIM_NONE = 0
-const NTHREADS_MAX_LOOPOPT = 128
-const USE_SHMEMHALO_DEFAULT = true
-const USE_SHMEMHALO_1D_DEFAULT = true
-const USE_FULLRANGE_DEFAULT = (false, false, true)
-const FULLRANGE_THRESHOLD = 1
-const NOEXPR = :(begin end)
-const MOD_METADATA = :__metadata__ # gensym_world("__metadata__", @__MODULE__) # TODO: name mangling should be used here later, or if there is any sense to leave it like that then a check whether it's available must be done before creating it
-const META_FUNCTION_PREFIX = string(gensym_world("META", @__MODULE__))
+const SUPPORTED_NDIMS            = [1, 2, 3]
+const NDIMS_NONE                 = 0
+const ERRMSG_KERNEL_UNSUPPORTED  = "unsupported kernel statements in @parallel kernel definition: @parallel is only applicable to kernels that contain exclusively array assignments using macros from FiniteDifferences{1|2|3}D or from another compatible computation submodule. @parallel_indices supports any kind of statements in the kernels."
+const ERRMSG_CHECK_NDIMS         = "ndims must be evaluatable at parse time (e.g. literal or constant) and has to be one of the following Integers: $(join(SUPPORTED_NDIMS,", "))"
+const ERRMSG_CHECK_MEMOPT        = "memopt must be evaluatable at parse time (e.g. literal or constant) and has to be of type Bool."
+const PSNumber                   = PKNumber
+const LOOPSIZE                   = 16
+const LOOPDIM_NONE               = 0
+const NTHREADS_MAX_MEMOPT_CUDA   = 128
+const NTHREADS_MAX_MEMOPT_AMDGPU = 256
+const USE_SHMEMHALO_DEFAULT      = true
+const USE_SHMEMHALO_1D_DEFAULT   = true
+const USE_FULLRANGE_DEFAULT      = (false, false, true)
+const FULLRANGE_THRESHOLD        = 1
+const NOEXPR                     = :(begin end)
+const MOD_METADATA               = :__metadata__ # gensym_world("__metadata__", @__MODULE__) # TODO: name mangling should be used here later, or if there is any sense to leave it like that then a check whether it's available must be done before creating it
+const META_FUNCTION_PREFIX       = string(gensym_world("META", @__MODULE__))
 
 
 ## FUNCTIONS TO DEAL WITH KERNEL DEFINITIONS
 
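With these constants, the memopt thread-block heuristic resolves as follows (reusing the hypothetical `nthreads_heuristic` from the sketch after patch 2; `flatdim = 3` flattens the z dimension because `loopdim = 3` is processed by the register-blocking loop):

```julia
# CUDA:   nthreads_x_max = 32, nthreads_max = NTHREADS_MAX_MEMOPT_CUDA   = 128
nthreads_heuristic((512, 512, 512); nthreads_x_max=32, nthreads_max=128, flatdim=3)  # (32, 4, 1)

# AMDGPU: nthreads_x_max = 64, nthreads_max = NTHREADS_MAX_MEMOPT_AMDGPU = 256
nthreads_heuristic((512, 512, 512); nthreads_x_max=64, nthreads_max=256, flatdim=3)  # (64, 4, 1)
```

This matches the "(32,4,1) threads for a 3-D case" comment in patch 4 and doubles both the x extent and the per-block thread budget on AMDGPU.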
From b8be2d937fc534c1cefc30e610f8f352f0ef8fe4 Mon Sep 17 00:00:00 2001
From: Samuel Omlin
Date: Fri, 5 Jul 2024 16:50:58 +0200
Subject: [PATCH 6/7] adjust unit tests for introduced nthreads max

---
 test/ParallelKernel/test_parallel.jl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/ParallelKernel/test_parallel.jl b/test/ParallelKernel/test_parallel.jl
index d78390f8..38302b66 100644
--- a/test/ParallelKernel/test_parallel.jl
+++ b/test/ParallelKernel/test_parallel.jl
@@ -33,10 +33,10 @@ import Enzyme
         @testset "@parallel" begin
             @static if $package == $PKG_CUDA
                 call = @prettystring(1, @parallel f(A))
-                @test occursin("CUDA.@cuda blocks = ParallelStencil.ParallelKernel.compute_nblocks(length.(ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A))), ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A))))) threads = ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))) stream = CUDA.stream() f(A, ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[3])))", call)
+                @test occursin("CUDA.@cuda blocks = ParallelStencil.ParallelKernel.compute_nblocks(length.(ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A))), ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A))); nthreads_x_max = 32)) threads = ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A))); nthreads_x_max = 32) stream = CUDA.stream() f(A, ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[3])))", call)
                 @test occursin("CUDA.synchronize(CUDA.stream())", call)
                 call = @prettystring(1, @parallel ranges f(A))
-                @test occursin("CUDA.@cuda blocks = ParallelStencil.ParallelKernel.compute_nblocks(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges)), ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges)))) threads = ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges))) stream = CUDA.stream() f(A, ParallelStencil.ParallelKernel.promote_ranges(ranges), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[3])))", call)
+                @test occursin("CUDA.@cuda blocks = ParallelStencil.ParallelKernel.compute_nblocks(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges)), ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges)); nthreads_x_max = 32)) threads = ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges)); nthreads_x_max = 32) stream = CUDA.stream() f(A, ParallelStencil.ParallelKernel.promote_ranges(ranges), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[3])))", call)
                 call = @prettystring(1, @parallel nblocks nthreads f(A))
                 @test occursin("CUDA.@cuda blocks = nblocks threads = nthreads stream = CUDA.stream() f(A, ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.compute_ranges(nblocks .* nthreads)), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.compute_ranges(nblocks .* nthreads)))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.compute_ranges(nblocks .* nthreads)))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.compute_ranges(nblocks .* nthreads)))[3])))", call)
                 call = @prettystring(1, @parallel ranges nblocks nthreads f(A))
@@ -45,10 +45,10 @@
                 @test occursin("CUDA.@cuda blocks = nblocks threads = nthreads stream = mystream f(A, ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.compute_ranges(nblocks .* nthreads)), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.compute_ranges(nblocks .* nthreads)))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.compute_ranges(nblocks .* nthreads)))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.compute_ranges(nblocks .* nthreads)))[3])))", call)
             elseif $package == $PKG_AMDGPU
                 call = @prettystring(1, @parallel f(A))
-                @test occursin("AMDGPU.@roc gridsize = ParallelStencil.ParallelKernel.compute_nblocks(length.(ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A))), ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A))))) groupsize = ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))) stream = AMDGPU.stream() f(A, ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[3])))", call)
+                @test occursin("AMDGPU.@roc gridsize = ParallelStencil.ParallelKernel.compute_nblocks(length.(ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A))), ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A))); nthreads_x_max = 64)) groupsize = ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A))); nthreads_x_max = 64) stream = AMDGPU.stream() f(A, ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[3])))", call)
                 @test occursin("AMDGPU.synchronize(AMDGPU.stream())", call)
                 call = @prettystring(1, @parallel ranges f(A))
-                @test occursin("AMDGPU.@roc gridsize = ParallelStencil.ParallelKernel.compute_nblocks(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges)), ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges)))) groupsize = ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges))) stream = AMDGPU.stream() f(A, ParallelStencil.ParallelKernel.promote_ranges(ranges), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[3])))", call)
+                @test occursin("AMDGPU.@roc gridsize = ParallelStencil.ParallelKernel.compute_nblocks(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges)), ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges)); nthreads_x_max = 64)) groupsize = ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges)); nthreads_x_max = 64) stream = AMDGPU.stream() f(A, ParallelStencil.ParallelKernel.promote_ranges(ranges), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[3])))", call)
                 call = @prettystring(1, @parallel nblocks nthreads f(A))
                 @test occursin("AMDGPU.@roc gridsize = nblocks groupsize = nthreads stream = AMDGPU.stream() f(A, ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.compute_ranges(nblocks .* nthreads)), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.compute_ranges(nblocks .* nthreads)))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.compute_ranges(nblocks .* nthreads)))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.compute_ranges(nblocks .* nthreads)))[3])))", call)
                 call = @prettystring(1, @parallel ranges nblocks nthreads f(A))
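The expected strings in these tests are snapshots of the generated launch code, so they now hard-code the backend-specific cap (`nthreads_x_max = 32` for CUDA, `64` for AMDGPU). A hedged sketch of how such a snapshot can be obtained interactively — using Base's `@macroexpand` and `MacroTools.prettify` rather than the test suite's `@prettystring` helper, and assuming a functional CUDA.jl installation:

```julia
using CUDA, ParallelStencil, MacroTools
@init_parallel_stencil(CUDA, Float64, 3)

# Macro-expand a @parallel call and flatten it to a string; the substring
# assertions in the tests work on exactly this kind of output.
expansion = string(MacroTools.prettify(@macroexpand @parallel f(A)))
occursin("nthreads_x_max = 32", expansion)  # expected to hold on the CUDA backend
```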
From fc489a750a3c042e8e35b0d8f54f61ac464f34 Mon Sep 17 00:00:00 2001
From: Samuel Omlin
Date: Fri, 5 Jul 2024 16:51:11 +0200
Subject: [PATCH 7/7] adjust unit tests for introduced nthreads max

---
 test/test_parallel.jl | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/test/test_parallel.jl b/test/test_parallel.jl
index b4dd8a97..def2adce 100644
--- a/test/test_parallel.jl
+++ b/test/test_parallel.jl
@@ -25,13 +25,13 @@ import ParallelStencil.@gorgeousexpand
     @require !@is_initialized()
     @init_parallel_stencil($package, Float64, 3)
     @require @is_initialized()
-    @testset "@parallel " begin # NOTE: calls must go to ParallelStencil.ParallelKernel.parallel and must therefore give the same result as in ParallelKernel (tests copied 1-to-1 from there).
+    @testset "@parallel " begin # NOTE: calls must go to ParallelStencil.ParallelKernel.parallel and must therefore give the same result as in ParallelKernel, except for memopt tests (tests copied 1-to-1 from there).
        @static if $package == $PKG_CUDA
            call = @prettystring(1, @parallel f(A))
-           @test occursin("CUDA.@cuda blocks = ParallelStencil.ParallelKernel.compute_nblocks(length.(ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A))), ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A))))) threads = ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))) stream = CUDA.stream() f(A, ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[3])))", call)
+           @test occursin("CUDA.@cuda blocks = ParallelStencil.ParallelKernel.compute_nblocks(length.(ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A))), ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A))); nthreads_x_max = 32)) threads = ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A))); nthreads_x_max = 32) stream = CUDA.stream() f(A, ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[3])))", call)
           @test occursin("CUDA.synchronize(CUDA.stream())", call)
           call = @prettystring(1, @parallel ranges f(A))
-           @test occursin("CUDA.@cuda blocks = ParallelStencil.ParallelKernel.compute_nblocks(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges)), ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges)))) threads = ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges))) stream = CUDA.stream() f(A, ParallelStencil.ParallelKernel.promote_ranges(ranges), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[3])))", call)
+           @test occursin("CUDA.@cuda blocks = ParallelStencil.ParallelKernel.compute_nblocks(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges)), ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges)); nthreads_x_max = 32)) threads = ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges)); nthreads_x_max = 32) stream = CUDA.stream() f(A, ParallelStencil.ParallelKernel.promote_ranges(ranges), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[3])))", call)
           call = @prettystring(1, @parallel nblocks nthreads f(A))
           @test occursin("CUDA.@cuda blocks = nblocks threads = nthreads stream = CUDA.stream() f(A, ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.compute_ranges(nblocks .* nthreads)), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.compute_ranges(nblocks .* nthreads)))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.compute_ranges(nblocks .* nthreads)))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.compute_ranges(nblocks .* nthreads)))[3])))", call)
           call = @prettystring(1, @parallel ranges nblocks nthreads f(A))
@@ -44,10 +44,10 @@
           @test occursin("CUDA.@cuda blocks = ParallelStencil.ParallelKernel.compute_nblocks(cld.(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges)),", call) # NOTE: now it is a very long multi line expression; before it continued as follows: (1, 1, 16)), ParallelStencil.compute_nthreads_memopt(cld.(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges)), (1, 1, 16)), 3, (-1:1, -1:1, -1:1))) threads = ParallelStencil.compute_nthreads_memopt(cld.(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges)), (1, 1, 16)), 3, (-1:1, -1:1, -1:1)) stream = CUDA.stream() shmem = ((ParallelStencil.compute_nthreads_memopt(cld.(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges)), (1, 1, 16)), 3, (-1:1, -1:1, -1:1)))[1] + 3) * ((ParallelStencil.compute_nthreads_memopt(cld.(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges)), (1, 1, 16)), 3, (-1:1, -1:1, -1:1)))[2] + 3) * sizeof(Float64) f(A, ParallelStencil.ParallelKernel.promote_ranges(ranges), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[3])))", call)
        elseif $package == $PKG_AMDGPU
           call = @prettystring(1, @parallel f(A))
-           @test occursin("AMDGPU.@roc gridsize = ParallelStencil.ParallelKernel.compute_nblocks(length.(ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A))), ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A))))) groupsize = ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))) stream = AMDGPU.stream() f(A, ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[3])))", call)
+           @test occursin("AMDGPU.@roc gridsize = ParallelStencil.ParallelKernel.compute_nblocks(length.(ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A))), ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A))); nthreads_x_max = 64)) groupsize = ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A))); nthreads_x_max = 64) stream = AMDGPU.stream() f(A, ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[3])))", call)
           @test occursin("AMDGPU.synchronize(AMDGPU.stream())", call)
           call = @prettystring(1, @parallel ranges f(A))
-           @test occursin("AMDGPU.@roc gridsize = ParallelStencil.ParallelKernel.compute_nblocks(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges)), ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges)))) groupsize = ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges))) stream = AMDGPU.stream() f(A, ParallelStencil.ParallelKernel.promote_ranges(ranges), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[3])))", call)
+           @test occursin("AMDGPU.@roc gridsize = ParallelStencil.ParallelKernel.compute_nblocks(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges)), ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges)); nthreads_x_max = 64)) groupsize = ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges)); nthreads_x_max = 64) stream = AMDGPU.stream() f(A, ParallelStencil.ParallelKernel.promote_ranges(ranges), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[3])))", call)
           call = @prettystring(1, @parallel nblocks nthreads f(A))
           @test occursin("AMDGPU.@roc gridsize = nblocks groupsize = nthreads stream = AMDGPU.stream() f(A, ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.compute_ranges(nblocks .* nthreads)), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.compute_ranges(nblocks .* nthreads)))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.compute_ranges(nblocks .* nthreads)))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.compute_ranges(nblocks .* nthreads)))[3])))", call)
           call = @prettystring(1, @parallel ranges nblocks nthreads f(A))
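The `shmem` factor in the memopt context line above can be checked by hand (a worked example for the CUDA backend; the block size follows from the heuristic sketched after patch 5, and the `+ 3` halo extension in x and y is taken verbatim from the expected string for the `(-1:1, -1:1, -1:1)` stencil with `loopdim = 3`):

```julia
# Shared-memory request implied by the expected launch string:
nthreads = (32, 4, 1)  # compute_nthreads_memopt result with nthreads_max_memopt = 128
shmem    = (nthreads[1] + 3) * (nthreads[2] + 3) * sizeof(Float64)
# (32 + 3) * (4 + 3) * 8 = 1960 bytes of shared memory per block
```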