Merge branch 'main' into fields
omlins authored Oct 16, 2024
2 parents 6bf88f8 + 241f0c5 commit 08067e7
Showing 5 changed files with 39 additions and 13 deletions.
7 changes: 7 additions & 0 deletions .github/dependabot.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
version: 2
updates:
- package-ecosystem: "github-actions"
directory: "/" # Location of package manifests
schedule:
interval: "weekly"
6 changes: 3 additions & 3 deletions Project.toml
@@ -1,7 +1,7 @@
name = "ParallelStencil"
uuid = "94395366-693c-11ea-3b26-d9b7aac5d958"
authors = ["Samuel Omlin", "Ludovic Räss"]
version = "0.13.2"
version = "0.13.6"

[deps]
CellArrays = "d35fcfd7-7af4-4c67-b1aa-d78070614af4"
@@ -21,10 +21,10 @@ ParallelStencil_CUDAExt = "CUDA"
ParallelStencil_EnzymeExt = "Enzyme"

[compat]
AMDGPU = "0.6, 0.7, 0.8"
AMDGPU = "0.6, 0.7, 0.8, 0.9, 1"
CUDA = "3.12, 4, 5"
CellArrays = "0.2.1"
Enzyme = "0.11"
Enzyme = "0.11, 0.12, 0.13"
MacroTools = "0.5"
Polyester = "0.7"
StaticArrays = "1"
9 changes: 5 additions & 4 deletions README.md
@@ -1,6 +1,7 @@
<h1> <img src="docs/logo/logo_ParallelStencil.png" alt="ParallelStencil.jl" width="50"> ParallelStencil.jl </h1>

[![Build Status](https://github.com/omlins/ParallelStencil.jl/workflows/CI/badge.svg)](https://github.com/omlins/ParallelStencil.jl/actions)
[![DOI](https://proceedings.juliacon.org/papers/10.21105/jcon.00138/status.svg)](https://doi.org/10.21105/jcon.00138)

ParallelStencil empowers domain scientists to write architecture-agnostic high-level code for parallel high-performance stencil computations on GPUs and CPUs. Performance similar to CUDA C / HIP can be achieved, which is typically a large improvement over the performance reached when using only [CUDA.jl] or [AMDGPU.jl] [GPU Array programming]. For example, a 2-D shallow ice solver presented at JuliaCon 2020 \[[1][JuliaCon20a]\] achieved a nearly 20 times better performance than a corresponding [GPU Array programming] implementation; in absolute terms, it reached 70% of the theoretical upper performance bound of the used Nvidia P100 GPU, as defined by the effective throughput metric, `T_eff` (note that `T_eff` is very different from common throughput metrics, see section [Performance metric](#performance-metric)). The GPU performance of the solver is reported in green, the CPU performance in blue:

@@ -85,7 +86,7 @@ Note that `@d2_yi` and `@d2_zi` perform the analogue operations as `@d2_xi` along
Type `?FiniteDifferences3D` in the [Julia REPL] to explore all provided macros.

## 50-lines example deployable on GPU and CPU
This concise 3-D heat diffusion solver uses ParallelStencil and a a simple boolean `USE_GPU` defines whether it runs on GPU or CPU (the environment variable [JULIA_NUM_THREADS] defines how many cores are used in the latter case):
This concise 3-D heat diffusion solver uses ParallelStencil and a simple boolean `USE_GPU` defines whether it runs on GPU or CPU (the environment variable [JULIA_NUM_THREADS] defines how many cores are used in the latter case):

```julia
const USE_GPU = true
# ... (remaining lines of the example are collapsed in this diff view)
```
@@ -276,7 +277,7 @@ It can be launched as follows:
Furthermore, a set of architecture-agnostic low level kernel language constructs is supported in these `@parallel_indices` kernels (see in [Module documentation callable from the Julia REPL / IJulia](#module-documentation-callable-from-the-julia-repl--ijulia)). They enable, e.g., explicit usage of shared memory (see [this 2-D heat diffusion example](/examples/diffusion2D_shmem_novis.jl)).

## Support for logical arrays of small arrays / structs
Logical arrays of small arrays / structs enabling optimized data access can be conveniently created with the architecture-agnostic allocation macros earlier introduced (see [Parallelization and optimization with one macro call]). To this purpose, ParallelStencil leverages `CellArray`s (from [CellArrays.jl], which relies in turn on [StaticArrays.jl]). To create a logical array of small arrays, it is sufficient to pass to any of these allocation macros the keyword `celldims` with the dimensions of the inner arrays, e.g.:
Logical arrays of small arrays / structs enabling optimized data access can be conveniently created with the architecture-agnostic allocation macros earlier introduced (see [Parallelization and optimization with one macro call](#parallelization-with-one-macro-call)). To this purpose, ParallelStencil leverages `CellArray`s (from [CellArrays.jl], which relies in turn on [StaticArrays.jl]). To create a logical array of small arrays, it is sufficient to pass to any of these allocation macros the keyword `celldims` with the dimensions of the inner arrays, e.g.:
```julia
nx, ny, nz = 128, 128, 128
celldims = (4, 4)
# ... (remaining lines of the example are collapsed in this diff view)
```
@@ -522,7 +523,7 @@ To discuss technical issues, please post on Julia Discourse in the [GPU topic] o
To discuss numerical/domain-science issues, please post on Julia Discourse in the [Numerics topic] or the [Modelling & Simulations topic] or whichever other topic fits best your issue.

## Your contributions
This project welcomes your contribution! Have you developed an application with ParallelStencil that could be featured as a mini-app? Please contribute it to share it with the world! Would you like to use other methods than finite differences with math-close notation in ParallelStencil kernels? Then check out the tiny `ParallelStencil.FiniteDifferences1D` submodule as an example for enabling math-close notation for a method and contribute your own submodule! Are you missing a great feature in the core of ParallelStencil? Maybe you can contribute yourself!
Please open an issue to discuss your idea for a contribution beforehand. Furthermore, note that a pull request should always address a significant issue in its completeness. Moreover, pull requests should blend nicely into the existing project; common sense is the primary guide in this regard (community guideline documents, e.g. [ColPrac](https://github.com/SciML/ColPrac), can be consulted in addition for inspiration). We are looking forward to your contribution!

## References
@@ -545,7 +546,7 @@ Please open an issue to discuss your idea for a contribution beforehand. Further
[JuliaCon20a]: https://www.youtube.com/watch?v=vPsfZUqI4_0
[JuliaCon20b]: https://www.youtube.com/watch?v=1t1AKnnGRqA
[JuliaCon19]: https://www.youtube.com/watch?v=b90qqbYJ58Q
[PASC19]: https://pasc19.pasc-conference.org/program/schedule/presentation/?id=msa218&sess=sess144
[PASC19]: https://pasc19.pasc-conference.org/program/schedule/index.html%3Fpost_type=page&p=10&id=msa218&sess=sess144.html
[Base.Threads]: https://docs.julialang.org/en/v1/base/multi-threading/
[ImplicitGlobalGrid.jl]: https://github.com/eth-cscs/ImplicitGlobalGrid.jl
[JULIA_NUM_THREADS]:https://docs.julialang.org/en/v1.0.0/manual/environment-variables/#JULIA_NUM_THREADS-1
24 changes: 21 additions & 3 deletions src/ParallelKernel/EnzymeExt/autodiff_gpu.jl
@@ -2,18 +2,36 @@ import ParallelStencil
import ParallelStencil: PKG_THREADS, PKG_POLYESTER
import Enzyme

function ParallelStencil.ParallelKernel.AD.init_AD(package::Symbol)
if iscpu(package)
Enzyme.API.runtimeActivity!(true) # NOTE: this is currently required for Enzyme to work correctly with threads
# function ParallelStencil.ParallelKernel.AD.init_AD(package::Symbol)
# if iscpu(package)
# Enzyme.API.runtimeActivity!(true) # NOTE: this is currently required for Enzyme to work correctly with threads
# end
# end

# ParallelStencil injects a configuration parameter at the end, for Enzyme we need to wrap that parameter as a Annotation
# for all purposes this ought to be Const. This is not ideal since we might accidentially wrap other parameters the user
# provided as well. This is needed to support @parallel autodiff_deferred(...)
function promote_to_const(args::Vararg{Any,N}) where N
ntuple(Val(N)) do i
@inline
if !(args[i] isa Enzyme.Annotation ||
(args[i] isa UnionAll && args[i] <: Enzyme.Annotation) || # Const
(args[i] isa DataType && args[i] <: Enzyme.Annotation)) # Const{Nothing}
return Enzyme.Const(args[i])
else
return args[i]
end
end
end

function ParallelStencil.ParallelKernel.AD.autodiff_deferred!(arg, args...) # NOTE: minimal specialization is used to avoid overwriting the default method
args = promote_to_const(args...)
Enzyme.autodiff_deferred(arg, args...)
return
end

function ParallelStencil.ParallelKernel.AD.autodiff_deferred_thunk!(arg, args...) # NOTE: minimal specialization is used to avoid overwriting the default method
args = promote_to_const(args...)
Enzyme.autodiff_deferred_thunk(arg, args...)
return
end
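
As an aside, the intent of `promote_to_const` above can be sketched as follows (a hypothetical illustration, not part of the commit; it assumes Enzyme is loaded and `promote_to_const` is in scope):

```julia
# Plain arguments get wrapped in Enzyme.Const; anything that is already
# an Enzyme annotation (or an annotation type such as Const or
# Const{Nothing}) is passed through unchanged.
args = promote_to_const(Enzyme.Duplicated([1.0], [0.0]), 3.14, Enzyme.Const(2))
# args[1] stays a Duplicated annotation, args[2] becomes Enzyme.Const(3.14),
# and args[3] is already a Const and is left as is.
```

This pass-through behavior is what lets the launch-configuration parameter that ParallelStencil injects be handed to `Enzyme.autodiff_deferred` without the user annotating it explicitly.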
6 changes: 3 additions & 3 deletions test/ParallelKernel/test_parallel.jl
@@ -92,7 +92,7 @@ import Enzyme
@test @prettystring(1, @parallel ∇=(V.x->V̄.x, V.y->V̄.y) f!(V.x, V.y, a)) == "@parallel configcall = f!(V.x, V.y, a) ParallelStencil.ParallelKernel.AD.autodiff_deferred!(Enzyme.Reverse, f!, Enzyme.DuplicatedNoNeed(V.x, V̄.x), Enzyme.DuplicatedNoNeed(V.y, V̄.y), Enzyme.Const(a))"
end;
@testset "AD.autodiff_deferred!" begin
@static if $package == $PKG_THREADS
@static if $package == $PKG_THREADS && VERSION.minor < 11 # TODO: remove restriction to Julia version < 1.11 once Enzyme support is available.
N = 16
a = 6.5
A = @rand(N)
@@ -113,8 +113,8 @@
end
return
end
@parallel configcall=f!(A, B, a) AD.autodiff_deferred!(Enzyme.Reverse, f!, DuplicatedNoNeed(A, Ā), DuplicatedNoNeed(B, B̄), Const(a))
Enzyme.autodiff_deferred(Enzyme.Reverse, g!, DuplicatedNoNeed(A_ref, Ā_ref), DuplicatedNoNeed(B_ref, B̄_ref), Const(a))
@parallel configcall=f!(A, B, a) AD.autodiff_deferred!(Enzyme.Reverse, Const(f!), Const, DuplicatedNoNeed(A, Ā), DuplicatedNoNeed(B, B̄), Const(a))
Enzyme.autodiff_deferred(Enzyme.Reverse, Const(g!), Const, DuplicatedNoNeed(A_ref, Ā_ref), DuplicatedNoNeed(B_ref, B̄_ref), Const(a))
@test Array(Ā) ≈ Ā_ref
@test Array(B̄) ≈ B̄_ref
end
