From 8df6a705e3161eaa0a486f864fbcf21807034580 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 12 Jun 2024 10:42:41 +0000 Subject: [PATCH] Deploy to GitHub pages --- .buildinfo | 4 + .nojekyll | 0 _images/assembly.svg | 3364 ++++++++++++++++++ _images/csr.svg | 1770 ++++++++++ _images/direct_arg.svg | 330 ++ _images/indirect_arg.svg | 833 +++++ _images/indirect_arg_flattened.svg | 832 +++++ _images/iteration_spaces.svg | 5040 +++++++++++++++++++++++++++ _images/mixed_assembly.svg | 3703 ++++++++++++++++++++ _images/mixed_sparsity.svg | 602 ++++ _images/mpi_matrix.svg | 297 ++ _images/pyop2_architecture.svg | 890 +++++ _images/pyop2_colouring.svg | 2370 +++++++++++++ _images/pyop2_device_data_state.svg | 529 +++ _images/pyop2_mpi_mesh.svg | 2267 ++++++++++++ _sources/architecture.rst.txt | 76 + _sources/backends.rst.txt | 457 +++ _sources/caching.rst.txt | 112 + _sources/concepts.rst.txt | 268 ++ _sources/index.rst.txt | 44 + _sources/installation.rst.txt | 20 + _sources/ir.rst.txt | 324 ++ _sources/kernels.rst.txt | 234 ++ _sources/linear_algebra.rst.txt | 304 ++ _sources/mixed.rst.txt | 144 + _sources/mpi.rst.txt | 125 + _sources/plan.rst.txt | 80 + _sources/profiling.rst.txt | 170 + _sources/pyop2.codegen.rst.txt | 61 + _sources/pyop2.rst.txt | 142 + _sources/pyop2.types.rst.txt | 85 + _sources/user.rst.txt | 68 + _static/basic.css | 925 +++++ _static/classic.css | 269 ++ _static/default.css | 1 + _static/doctools.js | 156 + _static/documentation_options.js | 13 + _static/file.png | Bin 0 -> 286 bytes _static/language_data.js | 199 ++ _static/minus.png | Bin 0 -> 90 bytes _static/plus.png | Bin 0 -> 90 bytes _static/pygments.css | 75 + _static/searchtools.js | 619 ++++ _static/sidebar.js | 70 + _static/sphinx_highlight.js | 154 + architecture.html | 184 + backends.html | 560 +++ caching.html | 221 ++ concepts.html | 363 ++ genindex.html | 1150 ++++++ index.html | 224 ++ installation.html | 128 
+ ir.html | 412 +++ kernels.html | 326 ++ linear_algebra.html | 387 ++ mixed.html | 250 ++ mpi.html | 234 ++ objects.inv | Bin 0 -> 3018 bytes plan.html | 187 + profiling.html | 287 ++ py-modindex.html | 173 + pyop2.codegen.html | 155 + pyop2.html | 1172 +++++++ pyop2.types.html | 2472 +++++++++++++ search.html | 100 + searchindex.js | 1 + user.html | 125 + 67 files changed, 37137 insertions(+) create mode 100644 .buildinfo create mode 100644 .nojekyll create mode 100644 _images/assembly.svg create mode 100644 _images/csr.svg create mode 100644 _images/direct_arg.svg create mode 100644 _images/indirect_arg.svg create mode 100644 _images/indirect_arg_flattened.svg create mode 100644 _images/iteration_spaces.svg create mode 100644 _images/mixed_assembly.svg create mode 100644 _images/mixed_sparsity.svg create mode 100644 _images/mpi_matrix.svg create mode 100644 _images/pyop2_architecture.svg create mode 100644 _images/pyop2_colouring.svg create mode 100644 _images/pyop2_device_data_state.svg create mode 100644 _images/pyop2_mpi_mesh.svg create mode 100644 _sources/architecture.rst.txt create mode 100644 _sources/backends.rst.txt create mode 100644 _sources/caching.rst.txt create mode 100644 _sources/concepts.rst.txt create mode 100644 _sources/index.rst.txt create mode 100644 _sources/installation.rst.txt create mode 100644 _sources/ir.rst.txt create mode 100644 _sources/kernels.rst.txt create mode 100644 _sources/linear_algebra.rst.txt create mode 100644 _sources/mixed.rst.txt create mode 100644 _sources/mpi.rst.txt create mode 100644 _sources/plan.rst.txt create mode 100644 _sources/profiling.rst.txt create mode 100644 _sources/pyop2.codegen.rst.txt create mode 100644 _sources/pyop2.rst.txt create mode 100644 _sources/pyop2.types.rst.txt create mode 100644 _sources/user.rst.txt create mode 100644 _static/basic.css create mode 100644 _static/classic.css create mode 100644 _static/default.css create mode 100644 _static/doctools.js create mode 100644 
_static/documentation_options.js create mode 100644 _static/file.png create mode 100644 _static/language_data.js create mode 100644 _static/minus.png create mode 100644 _static/plus.png create mode 100644 _static/pygments.css create mode 100644 _static/searchtools.js create mode 100644 _static/sidebar.js create mode 100644 _static/sphinx_highlight.js create mode 100644 architecture.html create mode 100644 backends.html create mode 100644 caching.html create mode 100644 concepts.html create mode 100644 genindex.html create mode 100644 index.html create mode 100644 installation.html create mode 100644 ir.html create mode 100644 kernels.html create mode 100644 linear_algebra.html create mode 100644 mixed.html create mode 100644 mpi.html create mode 100644 objects.inv create mode 100644 plan.html create mode 100644 profiling.html create mode 100644 py-modindex.html create mode 100644 pyop2.codegen.html create mode 100644 pyop2.html create mode 100644 pyop2.types.html create mode 100644 search.html create mode 100644 searchindex.js create mode 100644 user.html diff --git a/.buildinfo b/.buildinfo new file mode 100644 index 000000000..d94ac7a8e --- /dev/null +++ b/.buildinfo @@ -0,0 +1,4 @@ +# Sphinx build info version 1 +# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. 
+config: a1951cdb1ea774dd1337146a7e168917 +tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 000000000..e69de29bb diff --git a/_images/assembly.svg b/_images/assembly.svg new file mode 100644 index 000000000..5c87b8d89 --- /dev/null +++ b/_images/assembly.svg @@ -0,0 +1,3364 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/_images/csr.svg 
b/_images/csr.svg new file mode 100644 index 000000000..b9e736a71 --- /dev/null +++ b/_images/csr.svg @@ -0,0 +1,1770 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 0 + 0 + 0 + 0 + 0 + 0 + 0 + 0 + 0 + 0 + 10 + 3 + 3 + 8 + 9 + 7 + 8 + 7 + 0 + -2 + 8 + 7 + 5 + 9 + 13 + Sparse Matrix + + + + + + + + + + + + + + + + + + + 10 + -2 + 3 + 9 + 7 + 8 + 7 + 3 + 8 + 7 + 5 + 8 + 9 + 13 + Values array + + + + + + + + + + + + + + + + + + + 0 + 4 + 0 + 1 + 1 + 2 + 3 + 0 + 2 + 3 + 4 + 1 + 3 + 4 + Column indices array + + + + + + + + + + + 0 + 2 + 4 + 7 + 11 + 14 + Row pointer array + + diff --git a/_images/direct_arg.svg b/_images/direct_arg.svg new file mode 100644 index 000000000..7817f3228 --- /dev/null +++ b/_images/direct_arg.svg @@ -0,0 +1,330 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + (dim 2) + + + argument Dat + iteration Set + i + i+1 + 2i + 2i+1 + + diff --git a/_images/indirect_arg.svg b/_images/indirect_arg.svg new file mode 100644 index 000000000..ff737c2e9 --- /dev/null +++ b/_images/indirect_arg.svg @@ -0,0 +1,833 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + argument Dat + iteration Set + i + 3i + 3i+1 + 3i+2 + 2m[i,0] + 2m[i,1] + 2m[i,2] + argument Map + (arity 3) + (dim 2) + kernel Arg + + + + + + + + + + + diff --git a/_images/indirect_arg_flattened.svg b/_images/indirect_arg_flattened.svg new file mode 100644 index 000000000..2da6cbe8f --- /dev/null +++ b/_images/indirect_arg_flattened.svg @@ -0,0 +1,832 @@ + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + argument Dat + iteration Set + i + 3i + 3i+1 + 3i+2 + 2m[i,0] + 2m[i,1] + 2m[i,2] + argument Map + (arity 3) + (dim 2) + kernel Arg + + + + + + + + + + + + + + + + (flattened) + + diff --git a/_images/iteration_spaces.svg b/_images/iteration_spaces.svg new file mode 100644 index 000000000..9029c95cd --- /dev/null +++ b/_images/iteration_spaces.svg @@ -0,0 +1,5040 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Unified iteration space:144 kernel output values computed by single thread + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 0,0 + 0,11 + Local iteration space: 144 kernel output values computedby 144 threads (0,0) ... (0,11) (1,0) ... (1,11) ... (11,0) ... 
(11,11) + 0,1 + 0,2 + 0,3 + 0,4 + 0,5 + 0,6 + 0,7 + 0,8 + 0,9 + 0,10 + 1,0 + 1,11 + 1,1 + 1,2 + 1,3 + 1,4 + 1,5 + 1,6 + 1,7 + 1,8 + 1,9 + 1,10 + 2,0 + 2,11 + 2,1 + 2,2 + 2,3 + 2,4 + 2,5 + 2,6 + 2,7 + 2,8 + 2,9 + 2,10 + 3,0 + 3,11 + 3,1 + 3,2 + 3,3 + 3,4 + 3,5 + 3,6 + 3,7 + 3,8 + 3,9 + 3,10 + 4,0 + 4,11 + 4,1 + 4,2 + 4,3 + 4,4 + 4,5 + 4,6 + 4,7 + 4,8 + 4,9 + 4,10 + 5,0 + 5,11 + 5,1 + 5,2 + 5,3 + 5,4 + 5,5 + 5,6 + 5,7 + 5,8 + 5,9 + 5,10 + 6,0 + 6,11 + 6,1 + 6,2 + 6,3 + 6,4 + 6,5 + 6,6 + 6,7 + 6,8 + 6,9 + 6,10 + 7,0 + 7,11 + 7,1 + 7,2 + 7,3 + 7,4 + 7,5 + 7,6 + 7,7 + 7,8 + 7,9 + 7,10 + 8,0 + 8,11 + 8,1 + 8,2 + 8,3 + 8,4 + 8,5 + 8,6 + 8,7 + 8,8 + 8,9 + 8,10 + 9,11 + 9,1 + 9,2 + 9,3 + 9,4 + 9,5 + 9,6 + 9,7 + 9,8 + 9,9 + 9,10 + 9,0 + 10,0 + 10,11 + 10,1 + 10,2 + 10,3 + 10,4 + 10,5 + 10,6 + 10,7 + 10,8 + 10,9 + 10,10 + 11,0 + 11,11 + 11,1 + 11,2 + 11,3 + 11,4 + 11,5 + 11,6 + 11,7 + 11,8 + 11,9 + 11,10 + + diff --git a/_images/mixed_assembly.svg b/_images/mixed_assembly.svg new file mode 100644 index 000000000..94f08d5c0 --- /dev/null +++ b/_images/mixed_assembly.svg @@ -0,0 +1,3703 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/_images/mixed_sparsity.svg b/_images/mixed_sparsity.svg new file mode 100644 index 000000000..ae9d71e13 --- /dev/null +++ b/_images/mixed_sparsity.svg @@ -0,0 +1,602 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + 0,0 + 0,1 + 1,0 + 1,1 + Mapr,0 + Mapc,1 + Mapr,0 + Mapc,0 + Mapr,1 + Mapc,0 + Mapr,1 + Mapc,1 + + + + + + + + + DataSetc,0 + DataSetc,1 + DataSetr,0 + DataSetr,1 + Setit,0 + Mapc,0 + Mapc,1 + Mapr,0 + Mapr,1 + + + + + + + + diff --git a/_images/mpi_matrix.svg b/_images/mpi_matrix.svg new file mode 100644 index 000000000..a305ba41c --- /dev/null +++ b/_images/mpi_matrix.svg @@ -0,0 +1,297 @@ + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + offdiagonal + offdiagonal + + + diagonal + diagonal + diagonal + off-diagonal + off-diagonal + + + 0 + 1 + 2 + + diff --git a/_images/pyop2_architecture.svg b/_images/pyop2_architecture.svg new file mode 100644 index 000000000..eb33a5a03 --- /dev/null +++ b/_images/pyop2_architecture.svg @@ -0,0 +1,890 @@ + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + OpenCL + CUDA + + + + + + CPU compiler + PyOpenCL + PyCUDA + CPU OpenMP + CPU seq. + MPI + + + + PyOP2 Lib & Runtime Corecolouring, parallel scheduling + + + + Lin. algebraPETSc/Cusp + + + + + + + Kernels + Data + AccessDescriptors + Application code + + + + + + + + + + + + + + + + + + + + + Backends + Code generation + PyOP2 core + User code + + diff --git a/_images/pyop2_colouring.svg b/_images/pyop2_colouring.svg new file mode 100644 index 000000000..0544909ac --- /dev/null +++ b/_images/pyop2_colouring.svg @@ -0,0 +1,2370 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + edges + shared / stagingmemory + vertices + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/_images/pyop2_device_data_state.svg b/_images/pyop2_device_data_state.svg new file mode 100644 index 000000000..c85170146 --- /dev/null +++ b/_images/pyop2_device_data_state.svg @@ -0,0 +1,529 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + Deviceunallocated + + Device + + Both + + Host + + + + + + + + + + + + + + allocate_device() + par_loop(write) + par_loop(write) + par_loop(write) + par_loop (read) + 
to_device() + access data + accessdata_ro + from_device() + accessdata + par_loop(read) + + diff --git a/_images/pyop2_mpi_mesh.svg b/_images/pyop2_mpi_mesh.svg new file mode 100644 index 000000000..51d2636f1 --- /dev/null +++ b/_images/pyop2_mpi_mesh.svg @@ -0,0 +1,2267 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + processor 0 + processor 1 + core + owned + exec + non-exec + core + owned + exec + non-exec + + + halos + + diff --git a/_sources/architecture.rst.txt b/_sources/architecture.rst.txt new file mode 100644 index 000000000..f14a6da10 --- /dev/null +++ b/_sources/architecture.rst.txt @@ -0,0 +1,76 @@ +.. _architecture: + +PyOP2 Architecture +================== + +As described in :ref:`concepts`, PyOP2 exposes an API that allows users to +declare the topology of unstructured meshes in the form of :class:`Sets +` and :class:`Maps ` and data in the form of +:class:`Dats `, :class:`Mats `, :class:`Globals +` and :class:`Consts `. Computations on this data +are described by :class:`Kernels ` described in :ref:`kernels` +and executed by :func:`parallel loops `. + +The API is the frontend to the PyOP2 runtime compilation architecture, which +supports the generation and just-in-time (JIT) compilation of low-level code +for a range of backends described in :doc:`backends` and the efficient +scheduling of parallel computations. 
A schematic overview of the PyOP2 +architecture is given below: + +.. figure:: images/pyop2_architecture.svg + :align: center + + Schematic overview of the PyOP2 architecture + +From an outside perspective, PyOP2 is a conventional Python library, with +performance-critical library functions implemented in Cython_. A user's +application code makes calls to the PyOP2 API, most of which are conventional +library calls. The exceptions are :func:`~pyop2.par_loop` calls, which +encapsulate PyOP2's runtime core functionality performing backend-specific +code generation. Executing a parallel loop comprises the following steps: + +1. Compute a parallel execution plan, including information for efficient + staging of data and partitioning and colouring of the iteration set for + conflict-free parallel execution. This process is described in :doc:`plan` + and does not apply to the sequential backend. +2. Generate backend-specific code for executing the computation for a given + set of :func:`~pyop2.par_loop` arguments as detailed in :doc:`backends` + according to the execution plan computed in the previous step. +3. Pass the generated code to a backend-specific toolchain for just-in-time + compilation, producing a shared library callable as a Python module which + is dynamically loaded. This module is cached on disk to save recompilation + when the same :func:`~pyop2.par_loop` is called again for the same backend. +4. Build the backend-specific list of arguments to be passed to the generated + code, which may initiate host-to-device data transfer for the CUDA and + OpenCL backends. +5. Call into the generated module to perform the actual computation. For + distributed parallel computations this involves separate calls for the + regions owned by the current processor and the halo as described in + :doc:`mpi`. +6. Perform any necessary reductions for :class:`Globals <pyop2.Global>`. +7. Call the backend-specific matrix assembly procedure on any + :class:`~pyop2.Mat` arguments. + +.. 
_backend-support: + +Multiple Backend Support +------------------------ + +The backend is selected by passing the keyword argument ``backend`` to the +:func:`~pyop2.init` function. If omitted, the ``sequential`` backend is +selected by default. This choice can be overridden by exporting the +environment variable ``PYOP2_BACKEND``, which allows switching backends +without having to touch the code. Once chosen, the backend cannot be changed +for the duration of the running Python interpreter session. + +PyOP2 provides a single API to the user, regardless of which backend the +computations are running on. All classes and functions that form the public +API defined in :mod:`pyop2.op2` are interfaces, whose concrete implementations +are initialised according to the chosen backend. A metaclass takes care of +instantiating a backend-specific version of the requested class and setting +the corresponding docstrings such that this process is entirely transparent to +the user. The implementation of the PyOP2 backends is completely orthogonal to +the backend selection process and free to use established practices of +object-oriented design. + +.. _Cython: http://cython.org diff --git a/_sources/backends.rst.txt b/_sources/backends.rst.txt new file mode 100644 index 000000000..189e4cf60 --- /dev/null +++ b/_sources/backends.rst.txt @@ -0,0 +1,457 @@ +.. _backends: + +PyOP2 Backends +============== + +PyOP2 provides a number of different backends to be able to run parallel +computations on different hardware architectures. The currently supported +backends are + +* ``sequential``: runs sequentially on a single CPU core. +* ``openmp``: runs multiple threads on an SMP CPU using OpenMP. The number of + threads is set with the environment variable ``OMP_NUM_THREADS``. 
+* ``cuda``: offloads computation to an NVIDIA GPU (requires :ref:`CUDA and pycuda + `) +* ``opencl``: offloads computation to an OpenCL device, either a multi-core + CPU or a GPU (requires :ref:`OpenCL and pyopencl `) + +Distributed parallel computations using MPI are supported by PyOP2 and +described in detail in :doc:`mpi`. Data structures must be partitioned among +MPI processes with overlapping regions, so-called halos. The host backends +``sequential`` and ``openmp`` have full MPI support, the device backends +``cuda`` and ``opencl`` only support parallel loops on :class:`Dats +<pyop2.Dat>`. Hybrid parallel computations with OpenMP are possible, where +``OMP_NUM_THREADS`` threads are launched per MPI rank. + +.. _host_backends: + +Host backends +------------- + +Any computation in PyOP2 requires the generation of code at runtime specific +to each individual :func:`~pyop2.par_loop`. The host backends generate code +which is just-in-time (JIT) compiled into a shared library callable +via :mod:`ctypes`. The compilation procedure also takes care of +caching the compiled library on disk, such that the compilation cost +is not paid every time. + +.. _sequential_backend: + +Sequential backend +~~~~~~~~~~~~~~~~~~ + +Since there is no parallel computation for the sequential backend, the +generated code is a C wrapper function with a ``for`` loop calling the kernel +for the respective :func:`~pyop2.par_loop`. This wrapper also takes care of +staging in and out the data as requested by the access descriptors specified +in the parallel loop. Both the kernel and the wrapper function are +just-in-time compiled in a single compilation unit such that the kernel call +can be inlined and does not incur any function call overhead. + +Recall the :func:`~pyop2.par_loop` calling the ``midpoint`` kernel from +:doc:`kernels`: :: + + op2.par_loop(midpoint, cells, + midpoints(op2.WRITE), + coordinates(op2.READ, cell2vertex)) + +.. 
highlight:: c + :linenothreshold: 5 + +The JIT compiled code for this loop is the kernel followed by the generated +wrapper code: :: + + inline void midpoint(double p[2], double *coords[2]) { + p[0] = (coords[0][0] + coords[1][0] + coords[2][0]) / 3.0; + p[1] = (coords[0][1] + coords[1][1] + coords[2][1]) / 3.0; + } + + void wrap_midpoint__(PyObject *_start, PyObject *_end, + PyObject *_arg0_0, + PyObject *_arg1_0, PyObject *_arg1_0_map0_0) { + int start = (int)PyInt_AsLong(_start); + int end = (int)PyInt_AsLong(_end); + double *arg0_0 = (double *)(((PyArrayObject *)_arg0_0)->data); + double *arg1_0 = (double *)(((PyArrayObject *)_arg1_0)->data); + int *arg1_0_map0_0 = (int *)(((PyArrayObject *)_arg1_0_map0_0)->data); + double *arg1_0_vec[3]; + for ( int n = start; n < end; n++ ) { + int i = n; + arg1_0_vec[0] = arg1_0 + arg1_0_map0_0[i * 3 + 0] * 2; + arg1_0_vec[1] = arg1_0 + arg1_0_map0_0[i * 3 + 1] * 2; + arg1_0_vec[2] = arg1_0 + arg1_0_map0_0[i * 3 + 2] * 2; + midpoint(arg0_0 + i * 2, arg1_0_vec); + } + } + +Note that the wrapper function is called directly from Python and therefore +all arguments are plain Python objects, which first need to be unwrapped. The +arguments ``_start`` and ``_end`` define the iteration set indices to iterate +over. The remaining arguments are :class:`arrays ` +corresponding to a :class:`~pyop2.Dat` or :class:`~pyop2.Map` passed to the +:func:`~pyop2.par_loop`. Arguments are consecutively numbered to avoid name +clashes. + +The first :func:`~pyop2.par_loop` argument ``midpoints`` is direct and +therefore no corresponding :class:`~pyop2.Map` is passed to the wrapper +function and the data pointer is passed straight to the kernel with an +appropriate offset. The second argument ``coordinates`` is indirect and hence +a :class:`~pyop2.Dat`-:class:`~pyop2.Map` pair is passed. Pointers to the data +are gathered via the :class:`~pyop2.Map` of arity 3 and staged in the array +``arg1_0_vec``, which is passed to the kernel. 
The coordinate data can +therefore be accessed in the kernel via double indirection with the +:class:`~pyop2.Map` already applied. Note that for both arguments, the +pointers are to two consecutive double values, since the +:class:`~pyop2.DataSet` is of dimension two in either case. + +.. _openmp_backend: + +OpenMP backend +~~~~~~~~~~~~~~ + +In contrast to the sequential backend, the outermost ``for`` loop in the +OpenMP backend is annotated with OpenMP pragmas to execute in parallel with +multiple threads. To avoid race conditions on data access, the iteration set +is coloured and a thread safe execution plan is computed as described in +:ref:`plan-colouring`. + +The JIT compiled code for the parallel loop from above changes as follows: :: + + void wrap_midpoint__(PyObject* _boffset, + PyObject* _nblocks, + PyObject* _blkmap, + PyObject* _offset, + PyObject* _nelems, + PyObject *_arg0_0, + PyObject *_arg1_0, PyObject *_arg1_0_map0_0) { + int boffset = (int)PyInt_AsLong(_boffset); + int nblocks = (int)PyInt_AsLong(_nblocks); + int* blkmap = (int *)(((PyArrayObject *)_blkmap)->data); + int* offset = (int *)(((PyArrayObject *)_offset)->data); + int* nelems = (int *)(((PyArrayObject *)_nelems)->data); + double *arg0_0 = (double *)(((PyArrayObject *)_arg0_0)->data); + double *arg1_0 = (double *)(((PyArrayObject *)_arg1_0)->data); + int *arg1_0_map0_0 = (int *)(((PyArrayObject *)_arg1_0_map0_0)->data); + double *arg1_0_vec[32][3]; + #ifdef _OPENMP + int nthread = omp_get_max_threads(); + #else + int nthread = 1; + #endif + #pragma omp parallel shared(boffset, nblocks, nelems, blkmap) + { + int tid = omp_get_thread_num(); + #pragma omp for schedule(static) + for (int __b = boffset; __b < boffset + nblocks; __b++) + { + int bid = blkmap[__b]; + int nelem = nelems[bid]; + int efirst = offset[bid]; + for (int n = efirst; n < efirst+ nelem; n++ ) + { + int i = n; + arg1_0_vec[tid][0] = arg1_0 + arg1_0_map0_0[i * 3 + 0] * 2; + arg1_0_vec[tid][1] = arg1_0 + arg1_0_map0_0[i * 3 
+ 1] * 2; + arg1_0_vec[tid][2] = arg1_0 + arg1_0_map0_0[i * 3 + 2] * 2; + midpoint(arg0_0 + i * 2, arg1_0_vec[tid]); + } + } + } + } + +Computation is split into ``nblocks`` blocks which start at an initial offset +``boffset`` and correspond to colours that can be executed conflict free in +parallel. This loop over colours is therefore wrapped in an OpenMP parallel +region and is annotated with an ``omp for`` pragma. The block id ``bid`` for +each of these blocks is given by the block map ``blkmap`` and is the index +into the arrays ``nelems`` and ``offset`` provided as part of the execution +plan. These are the number of elements that are part of the given block and +its starting index. Note that each thread needs its own staging array +``arg1_0_vec``, which is therefore scoped by the thread id. + +.. _device_backends: + +Device backends +--------------- + +As with the host backends, the device backends have most of the implementation +in common. The PyOP2 data carriers :class:`~pyop2.Dat`, :class:`~pyop2.Global` +and :class:`~pyop2.Const` have a data array in host memory and a separate +array in device memory. Flags indicate the present state of a given data +carrier: + +* ``DEVICE_UNALLOCATED``: no data is allocated on the device +* ``HOST_UNALLOCATED``: no data is allocated on the host +* ``DEVICE``: data is up-to-date (valid) on the device, but invalid on the + host +* ``HOST``: data is up-to-date (valid) on the host, but invalid on the device +* ``BOTH``: data is up-to-date (valid) on both the host and device + +When a :func:`~pyop2.par_loop` is called, PyOP2 uses the +:ref:`access-descriptors` to determine which data needs to be allocated or +transferred from host to device prior to launching the kernel. Data is only +transferred if it is out of date at the target location and all data transfer +is triggered lazily i.e. the actual copy only occurs once the data is +requested. 
In particular there is no automatic transfer back of data from +device to host unless it is accessed on the host. + +A newly created device :class:`~pyop2.Dat` has no associated device data and +starts out in the state ``DEVICE_UNALLOCATED``. The diagram below shows all +actions that involve a state transition, which can be divided into three +groups: calling explicit data transfer functions (red), accessing data on the +host (black) and using the :class:`~pyop2.Dat` in a :func:`~pyop2.par_loop` +(blue). There is no need for users to explicitly initiate data transfers and +the transfer functions are only given for completeness. + +.. figure:: images/pyop2_device_data_state.svg + :align: center + + State transitions of a data carrier on PyOP2 device backends + +When a device :class:`~pyop2.Dat` is used in a :func:`~pyop2.par_loop` for the +first time, data is allocated on the device. If the :class:`~pyop2.Dat` is +only read, the host array is transferred to device if it was in state ``HOST`` +or ``DEVICE_UNALLOCATED`` before the :func:`~pyop2.par_loop` and the +:class:`~pyop2.Dat` is in the state ``BOTH`` afterwards, unless it was in +state ``DEVICE`` in which case it remains in that state. If the +:class:`~pyop2.Dat` is written to, data transfer before the +:func:`~pyop2.par_loop` is necessary unless the access descriptor is +:data:`~pyop2.WRITE`; afterwards the host data is out of date and the +:class:`~pyop2.Dat` is in the state ``DEVICE``. 
An overview of the state +transitions and necessary memory allocations and data transfers for the two +cases is given in the table below: + +====================== ============================== ================================================== +Initial state :func:`~pyop2.par_loop` read :func:`~pyop2.par_loop` written to +====================== ============================== ================================================== +``DEVICE_UNALLOCATED`` ``BOTH`` (alloc, transfer h2d) ``DEVICE`` (alloc, transfer h2d unless write-only) +``DEVICE`` ``DEVICE`` ``DEVICE`` +``HOST`` ``BOTH`` (transfer h2d) ``DEVICE`` (transfer h2d unless write-only) +``BOTH`` ``BOTH`` ``DEVICE`` +====================== ============================== ================================================== + +Accessing data on the host initiates a device to host data transfer if the +:class:`~pyop2.Dat` is in state ``DEVICE`` and leaves it in state ``HOST`` +when using the :meth:`~pyop2.Dat.data` property and ``BOTH`` when using +:meth:`~pyop2.Dat.data_ro`. + +The state transitions described above apply in the same way to a +:class:`~pyop2.Global`. A :class:`~pyop2.Const` is read-only, never modified +on device and therefore never out of date on the host. Hence there is no +state ``DEVICE`` and it is not necessary to copy back :class:`~pyop2.Const` +data from device to host. + +.. _cuda_backend: + +CUDA backend +~~~~~~~~~~~~ + +The CUDA backend makes extensive use of PyCUDA_ and its infrastructure for +just-in-time compilation of CUDA kernels and interfacing them to Python. +Linear solvers and sparse matrix data structures are implemented on top of the +`CUSP library`_ and are described in greater detail in :doc:`linear_algebra`. +Code generation uses a template based approach, where a ``__global__`` stub +routine to be called from the host is generated, which takes care of data +marshalling and calling the user kernel as an inline ``__device__`` function. 
+ +We consider the same ``midpoint`` kernel as in the previous examples, which +requires no CUDA-specific modifications and is automatically annotated with a +``__device__`` qualifier. PyCUDA_ automatically generates a host stub for the +generated kernel stub ``__midpoint_stub`` given a list of parameter types. It +takes care of translating Python objects to plain C data types and pointers, +such that a CUDA kernel can be launched straight from Python. The entire CUDA +code PyOP2 generates is as follows: :: + + __device__ void midpoint(double p[2], double *coords[2]) + { + p[0] = ((coords[0][0] + coords[1][0]) + coords[2][0]) / 3.0; + p[1] = ((coords[0][1] + coords[1][1]) + coords[2][1]) / 3.0; + } + + __global__ void __midpoint_stub(int set_size, int set_offset, + double *arg0, + double *ind_arg1, + int *ind_map, + short *loc_map, + int *ind_sizes, + int *ind_offs, + int block_offset, + int *blkmap, + int *offset, + int *nelems, + int *nthrcol, + int *thrcol, + int nblocks) { + extern __shared__ char shared[]; + __shared__ int *ind_arg1_map; + __shared__ int ind_arg1_size; + __shared__ double * ind_arg1_shared; + __shared__ int nelem, offset_b, offset_b_abs; + + double *ind_arg1_vec[3]; + + if (blockIdx.x + blockIdx.y * gridDim.x >= nblocks) return; + if (threadIdx.x == 0) { + int blockId = blkmap[blockIdx.x + blockIdx.y * gridDim.x + block_offset]; + nelem = nelems[blockId]; + offset_b_abs = offset[blockId]; + offset_b = offset_b_abs - set_offset; + + ind_arg1_size = ind_sizes[0 + blockId * 1]; + ind_arg1_map = &ind_map[0 * set_size] + ind_offs[0 + blockId * 1]; + + int nbytes = 0; + ind_arg1_shared = (double *) &shared[nbytes]; + } + + __syncthreads(); + + // Copy into shared memory + for ( int idx = threadIdx.x; idx < ind_arg1_size * 2; idx += blockDim.x ) { + ind_arg1_shared[idx] = ind_arg1[idx % 2 + ind_arg1_map[idx / 2] * 2]; + } + + __syncthreads(); + + // process set elements + for ( int idx = threadIdx.x; idx < nelem; idx += blockDim.x ) { + 
ind_arg1_vec[0] = ind_arg1_shared + loc_map[0*set_size + idx + offset_b]*2; + ind_arg1_vec[1] = ind_arg1_shared + loc_map[1*set_size + idx + offset_b]*2; + ind_arg1_vec[2] = ind_arg1_shared + loc_map[2*set_size + idx + offset_b]*2; + + midpoint(arg0 + 2 * (idx + offset_b_abs), ind_arg1_vec); + } + } + +The CUDA kernel ``__midpoint_stub`` is launched on the GPU for a specific +number of threads in parallel. Each thread is identified inside the kernel by +its thread id ``threadIdx`` within a block of threads identified by a two +dimensional block id ``blockIdx`` within a grid of blocks. + +As for OpenMP, there is the potential for data races, which are prevented by +colouring the iteration set and computing a parallel execution plan, where all +elements of the same colour can be modified simultaneously. Each colour is +computed by a block of threads in parallel. All threads of a thread block have +access to a shared memory, which is used as a shared staging area initialised +by thread 0 of each block, see lines 30-41 above. A call to +``__syncthreads()`` ensures these initial values are visible to all threads of +the block. After this barrier, all threads cooperatively gather data from the +indirectly accessed :class:`~pyop2.Dat` via the :class:`~pyop2.Map`, followed +by another synchronisation. Following that, each thread loops over the +elements in the partition with an increment of the block size. In each +iteration a thread-private array of pointers to coordinate data in shared +memory is built which is then passed to the ``midpoint`` kernel. As for other +backends, the first, directly accessed, argument, is passed as a pointer to +global device memory with a suitable offset. + +.. _opencl_backend: + +OpenCL backend +~~~~~~~~~~~~~~ + +The other device backend OpenCL is structurally very similar to the CUDA +backend. It uses PyOpenCL_ to interface to the OpenCL drivers and runtime. 
+Linear algebra operations are handled by PETSc_ as described in +:doc:`linear_algebra`. PyOP2 generates a kernel stub from a template similar +to the CUDA case. + +Consider the ``midpoint`` kernel from previous examples, whose parameters in +the kernel signature are automatically annotated with OpenCL storage +qualifiers. PyOpenCL_ provides Python wrappers for OpenCL runtime functions to +build a kernel from a code string, set its arguments and enqueue the kernel +for execution. It takes care of the necessary conversion from Python objects +to plain C data types. PyOP2 generates the following code for the ``midpoint`` +example: :: + + #define ROUND_UP(bytes) (((bytes) + 15) & ~15) + + void midpoint(__global double p[2], __local double *coords[2]); + void midpoint(__global double p[2], __local double *coords[2]) + { + p[0] = ((coords[0][0] + coords[1][0]) + coords[2][0]) / 3.0; + p[1] = ((coords[0][1] + coords[1][1]) + coords[2][1]) / 3.0; + } + + __kernel __attribute__((reqd_work_group_size(668, 1, 1))) + void __midpoint_stub( + __global double* arg0, + __global double* ind_arg1, + int set_size, + int set_offset, + __global int* p_ind_map, + __global short *p_loc_map, + __global int* p_ind_sizes, + __global int* p_ind_offsets, + __global int* p_blk_map, + __global int* p_offset, + __global int* p_nelems, + __global int* p_nthrcol, + __global int* p_thrcol, + __private int block_offset) { + __local char shared [64] __attribute__((aligned(sizeof(long)))); + __local int offset_b; + __local int offset_b_abs; + __local int active_threads_count; + + int nbytes; + int block_id; + + int i_1; + // shared indirection mappings + __global int* __local ind_arg1_map; + __local int ind_arg1_size; + __local double* __local ind_arg1_shared; + __local double* ind_arg1_vec[3]; + + if (get_local_id(0) == 0) { + block_id = p_blk_map[get_group_id(0) + block_offset]; + active_threads_count = p_nelems[block_id]; + offset_b_abs = p_offset[block_id]; + offset_b = offset_b_abs - 
set_offset;ind_arg1_size = p_ind_sizes[0 + block_id * 1]; + ind_arg1_map = &p_ind_map[0 * set_size] + p_ind_offsets[0 + block_id * 1]; + + nbytes = 0; + ind_arg1_shared = (__local double*) (&shared[nbytes]); + nbytes += ROUND_UP(ind_arg1_size * 2 * sizeof(double)); + } + barrier(CLK_LOCAL_MEM_FENCE); + + // staging in of indirect dats + for (i_1 = get_local_id(0); i_1 < ind_arg1_size * 2; i_1 += get_local_size(0)) { + ind_arg1_shared[i_1] = ind_arg1[i_1 % 2 + ind_arg1_map[i_1 / 2] * 2]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + for (i_1 = get_local_id(0); i_1 < active_threads_count; i_1 += get_local_size(0)) { + ind_arg1_vec[0] = ind_arg1_shared + p_loc_map[i_1 + 0*set_size + offset_b] * 2; + ind_arg1_vec[1] = ind_arg1_shared + p_loc_map[i_1 + 1*set_size + offset_b] * 2; + ind_arg1_vec[2] = ind_arg1_shared + p_loc_map[i_1 + 2*set_size + offset_b] * 2; + + midpoint((__global double* __private)(arg0 + (i_1 + offset_b_abs) * 2), ind_arg1_vec); + } + } + +Parallel computations in OpenCL are executed by *work items* organised into +*work groups*. OpenCL requires the annotation of all pointer arguments with +the memory region they point to: ``__global`` memory is visible to any work +item, ``__local`` memory to any work item within the same work group and +``__private`` memory is private to a work item. PyOP2 does this annotation +automatically for the user kernel if the OpenCL backend is used. Local memory +therefore corresponds to CUDA's shared memory and private memory is called +local memory in CUDA. The work item id within the work group is accessed via +the OpenCL runtime call ``get_local_id(0)``, the work group id via +``get_group_id(0)``. A barrier synchronisation across all work items of a work +group is enforced with a call to ``barrier(CLK_LOCAL_MEM_FENCE)``. Bearing +these differences in mind, the OpenCL kernel stub is structurally almost +identical to the corresponding CUDA version above. 
+ +The required local memory size per work group ``reqd_work_group_size`` is +computed as part of the execution plan. In CUDA this value is a launch +parameter to the kernel, whereas in OpenCL it needs to be hard coded as a +kernel attribute. + +.. _FEniCS project: http://fenicsproject.org +.. _PyCUDA: http://mathema.tician.de/software/pycuda/ +.. _CUSP library: http://cusplibrary.github.io +.. _PyOpenCL: http://mathema.tician.de/software/pyopencl/ +.. _PETSc: http://www.mcs.anl.gov/petsc/petsc-as/ diff --git a/_sources/caching.rst.txt b/_sources/caching.rst.txt new file mode 100644 index 000000000..6e894ecbb --- /dev/null +++ b/_sources/caching.rst.txt @@ -0,0 +1,112 @@ +.. _caching: + +Caching in PyOP2 +================ + +PyOP2 makes heavy use of caches to ensure performance is not adversely +affected by too many runtime computations. The caching in PyOP2 takes +a number of forms: + +1. Disk-based caching of generated code + + Since compiling a generated code module may be an expensive + operation, PyOP2 caches the generated code on disk such that + subsequent runs of the same simulation will not have to pay a + compilation cost. + +2. In memory caching of generated code function pointers + + Once code has been generated and loaded into the running PyOP2 + process, we cache the resulting callable function pointer for the + lifetime of the process, such that subsequent calls to the same + generated code are fast. + +3. In memory caching of expensive to build objects + + Some PyOP2 objects, in particular :class:`~pyop2.Sparsity` objects, + can be expensive to construct. Since a sparsity does not change if + it is built again with the same arguments, we only construct the + sparsity once for each unique set of arguments. + +The caching strategies for PyOP2 follow from two axioms: + +1. For PyOP2 :class:`~pyop2.Set`\s and :class:`~pyop2.Map`\s, equality + is identity +2. 
Caches of generated code should depend on metadata, but not data + +The first axiom implies that two :class:`~pyop2.Set`\s or +:class:`~pyop2.Map`\s compare equal if and only if they are the same +object. The second implies that generated code must be *independent* +of the absolute size of the data the :func:`~pyop2.par_loop` that +generated it executed over. For example, the size of the iteration +set should not be part of the key, but the arity of any maps and size +and type of every data item should be. + +One consequence of these rules is that there are effectively two +separate types of cache in PyOP2, object and class caches, +distinguished by where the cache itself lives. + +Class caches +------------ + +These are used to cache objects that depend on metadata, but not +object instances, such as generated code. They are implemented by +the cacheable class inheriting from :class:`~.Cached`. + +.. note:: + + There is currently no eviction strategy for class caches. Should + they grow too large, for example by executing many different parallel + loops, an out of memory error can occur. + +Object caches +------------- + +These are used to cache objects that are built on top of +:class:`~pyop2.Set`\s and :class:`~pyop2.Map`\s. They are implemented by the +cacheable class inheriting from :class:`~.ObjectCached` and the +caching instance defining a ``_cache`` attribute. + +The motivation for these caches is that the cache key for objects such as +sparsities relies on an identical sparsity being built if the +arguments are identical. So that users of the API do not have to +worry too much about carrying around "temporary" objects forever such +that they will hit caches, PyOP2 builds up a hierarchy of caches of +transient objects on top of the immutable sets and maps. + +So, for example, the user can build and throw away +:class:`~pyop2.DataSet`\s as normal in their code. Internally, however, +these instances are cached on the set they are built on top of. 
Thus, +in the following snippet, we have that ``ds`` and ``ds2`` are the same +object: + +.. code-block:: python + + s = op2.Set(1) + ds = op2.DataSet(s, 10) + ds2 = op2.DataSet(s, 10) + assert ds is ds2 + +The setup of these caches is such that the lifetime of objects in the +cache is tied to the lifetime of both the caching and the cached +object. In the above example, as long as the user program holds a +reference to one of ``s``, ``ds`` or ``ds2`` all three objects will +remain live. As soon as all references are lost, all three become +candidates for garbage collection. + +.. note:: + + The cache eviction strategy for these caches relies on the Python + garbage collector, and hence on the user not holding onto + references to some of either the cached or the caching objects for + too long. Should the objects on which the caches live persist, an + out of memory error may occur. + +Debugging cache leaks +--------------------- + +To debug potential problems with the cache, PyOP2 can be instructed to +print the size of both object and class caches at program exit. This +can be done by setting the environment variable +``PYOP2_PRINT_CACHE_SIZE`` to 1 before running a PyOP2 program, or +passing the ``print_cache_size`` to :func:`~pyop2.init`. diff --git a/_sources/concepts.rst.txt b/_sources/concepts.rst.txt new file mode 100644 index 000000000..f62ae0885 --- /dev/null +++ b/_sources/concepts.rst.txt @@ -0,0 +1,268 @@ +.. _concepts: + +PyOP2 Concepts +============== + +Many numerical algorithms and scientific computations on unstructured meshes +can be viewed as the *independent application* of a *local operation* +everywhere on a mesh. This local operation is often called a computational +*kernel* and its independent application lends itself naturally to parallel +computation. An unstructured mesh can be described by *sets of entities* +(vertices, edges, cells) and the connectivity between those sets forming the +topology of the mesh. 
+ +PyOP2 is a domain-specific language (DSL) for the parallel execution of +computational kernels on unstructured meshes or graphs. + +.. _sets: + +Sets and mappings +----------------- + +A mesh is defined by :class:`sets <pyop2.Set>` of entities and +:class:`mappings <pyop2.Map>` between these sets. Sets are used to represent +entities in the mesh (nodes in the graph) or degrees of freedom of data +(fields) living "on" the mesh (graph), while maps define the connectivity +between entities (links in the graph) or degrees of freedom, for example +associating an edge with its incident vertices. Sets of mesh entities may +coincide with sets of degrees of freedom, but this is not necessarily the case +e.g. the set of degrees of freedom for a field may be defined on the vertices +of the mesh and the midpoints of edges connecting the vertices. + +.. note :: + There is a requirement for the map to be of *constant arity*, that is each + element in the source set must be associated with a constant number of + elements in the target set. There is no requirement for the map to be + injective or surjective. This restriction excludes certain kinds of mappings + e.g. a map from vertices to incident edges or cells is only possible on a + very regular mesh where the multiplicity of any vertex is constant. + +In the following we declare a :class:`~pyop2.Set` ``vertices``, a +:class:`~pyop2.Set` ``edges`` and a :class:`~pyop2.Map` ``edges2vertices`` +between them, which associates the two incident vertices with each edge: :: + + vertices = op2.Set(4) + edges = op2.Set(3) + edges2vertices = op2.Map(edges, vertices, 2, [[0, 1], [1, 2], [2, 3]]) + +.. _data: + +Data +---- + +PyOP2 distinguishes three kinds of user provided data: data that lives on a +set (often referred to as a field) is represented by a :class:`~pyop2.Dat`, +data that has no association with a set by a :class:`~pyop2.Global` and data +that is visible globally and referred to by a unique identifier is declared as +:class:`~pyop2.Const`. 
Examples of the use of these data types are given in +the :ref:`par_loops` section below. + +.. _data_dat: + +Dat +~~~ + +Since a set does not have any type but only a cardinality, data declared on a +set through a :class:`~pyop2.Dat` needs additional metadata to allow PyOP2 to +interpret the data and to specify how much memory is required to store it. This +metadata is the *datatype* and the *shape* of the data associated with any +given set element. The shape is not associated with the :class:`~pyop2.Dat` +directly, but with a :class:`~pyop2.DataSet`. One can associate a scalar with +each element of the set or a one- or higher-dimensional vector. Similar to the +restriction on maps, the shape and therefore the size of the data associated +with each element needs to be uniform. PyOP2 supports all common primitive +data types supported by `NumPy`_. Custom datatypes are supported insofar as +the user implements the serialisation and deserialisation of that type into +primitive data that can be handled by PyOP2. + +Declaring coordinate data on the ``vertices`` defined above, where two float +coordinates are associated with each vertex, is done like this: :: + + dvertices = op2.DataSet(vertices, dim=2) + coordinates = op2.Dat(dvertices, + [[0.0, 0.0], [0.0, 1.0], [1.0, 1.0], [1.0, 0.0]], + dtype=float) + +.. _data_global: + +Global +~~~~~~ + +In contrast to a :class:`~pyop2.Dat`, a :class:`~pyop2.Global` has no +association to a set and the shape and type of the data are declared directly +on the :class:`~pyop2.Global`. A 2x2 elasticity tensor would be defined as +follows: :: + + elasticity = op2.Global((2, 2), [[1.0, 0.0], [0.0, 1.0]], dtype=float) + +.. _data_const: + +Const +~~~~~ + +Data that is globally visible and read-only to kernels is declared with a +:class:`~pyop2.Const` and needs to have a globally unique identifier. It does +not need to be declared as an argument to a :func:`~pyop2.par_loop`, but is +accessible in a kernel by name. 
A globally visible parameter ``eps`` would be +declared as follows: :: + + eps = op2.Const(1, 1e-14, name="eps", dtype=float) + +.. _data_mat: + +Mat +~~~ + +In a PyOP2 context, a (sparse) matrix is a linear operator from one set to +another. In other words, it is a linear function which takes a +:class:`~pyop2.Dat` on one set :math:`A` and returns the value of a +:class:`~pyop2.Dat` on another set :math:`B`. Of course, in particular, +:math:`A` may be the same set as :math:`B`. This makes the operation of at +least some matrices equivalent to the operation of a particular PyOP2 kernel. + +PyOP2 can be used to assemble :class:`matrices <pyop2.Mat>`, which are defined +on a :class:`sparsity pattern <pyop2.Sparsity>` which is built from a pair of +:class:`DataSets <pyop2.DataSet>` defining the row and column spaces the +sparsity maps between and one or more pairs of maps, one for the row and one +for the column space of the matrix respectively. The sparsity uniquely defines +the non-zero structure of the sparse matrix and can be constructed purely from +those mappings. To declare a :class:`~pyop2.Mat` on a :class:`~pyop2.Sparsity` +only the data type needs to be given. + +Since the construction of large sparsity patterns is a very expensive +operation, the decoupling of :class:`~pyop2.Mat` and :class:`~pyop2.Sparsity` +allows the reuse of sparsity patterns for a number of matrices without +recomputation. In fact PyOP2 takes care of caching sparsity patterns on behalf +of the user, so declaring a sparsity on the same maps as a previously declared +sparsity yields the cached object instead of building another one. + +Defining a matrix of floats on a sparsity which spans from the space of +vertices to the space of vertices via the edges is done as follows: :: + + sparsity = op2.Sparsity((dvertices, dvertices), + [(edges2vertices, edges2vertices)]) + matrix = op2.Mat(sparsity, float) + +.. 
_par_loops: + +Parallel loops +-------------- + +Computations in PyOP2 are executed as :func:`parallel loops <pyop2.par_loop>` +of a :class:`~pyop2.Kernel` over an *iteration set*. Parallel loops are the +core construct of PyOP2 and hide most of its complexity such as parallel +scheduling, partitioning, colouring, data transfer from and to device and +staging of the data into on chip memory. Computations in a parallel loop must +be independent of the order in which they are executed over the set to allow +PyOP2 maximum flexibility to schedule the computation in the most efficient +way. Kernels are described in more detail in :doc:`kernels`. + +.. _loop-invocations: + +Loop invocations +~~~~~~~~~~~~~~~~ + +A parallel loop invocation requires as arguments, other than the iteration set +and the kernel to operate on, the data the kernel reads and/or writes. A +parallel loop argument is constructed by calling the underlying data object +(i.e. the :class:`~pyop2.Dat` or :class:`~pyop2.Global`) and passing an +*access descriptor* and the mapping to be used when accessing the data. The +mapping is required for an *indirectly accessed* :class:`~pyop2.Dat` not +declared on the same set as the iteration set of the parallel loop. In the +case of *directly accessed* data defined on the same set as the iteration set +the map is omitted and only an access descriptor given. + +Consider a parallel loop that translates the ``coordinates`` field by a +constant offset given by the :class:`~pyop2.Const` ``offset``. Note how the +kernel has access to the local variable ``offset`` even though it has not been +passed as an argument to the :func:`~pyop2.par_loop`. This loop is direct and +the argument ``coordinates`` is read and written: :: + + op2.Const(2, [1.0, 1.0], dtype=float, name="offset"); + + translate = op2.Kernel("""void translate(double * coords) { + coords[0] += offset[0]; + coords[1] += offset[1]; + }""", "translate") + + op2.par_loop(translate, vertices, coordinates(op2.RW)) + +.. 
_access-descriptors: + +Access descriptors +~~~~~~~~~~~~~~~~~~ + +Access descriptors define how the data is accessed by the kernel and give +PyOP2 crucial information as to how the data needs to be treated during +staging in before and staging out after kernel execution. They must be one of +:data:`pyop2.READ` (read-only), :data:`pyop2.WRITE` (write-only), +:data:`pyop2.RW` (read-write), :data:`pyop2.INC` (increment), +:data:`pyop2.MIN` (minimum reduction) or :data:`pyop2.MAX` (maximum +reduction). + +Not all of these descriptors apply to all PyOP2 data types. A +:class:`~pyop2.Dat` can have modes :data:`~pyop2.READ`, :data:`~pyop2.WRITE`, +:data:`~pyop2.RW` and :data:`~pyop2.INC`. For a :class:`~pyop2.Global` the +valid modes are :data:`~pyop2.READ`, :data:`~pyop2.INC`, :data:`~pyop2.MIN` and +:data:`~pyop2.MAX` and for a :class:`~pyop2.Mat` only :data:`~pyop2.WRITE` and +:data:`~pyop2.INC` are allowed. + +.. _matrix-loops: + +Loops assembling matrices +~~~~~~~~~~~~~~~~~~~~~~~~~ + +We declare a parallel loop assembling the ``matrix`` via a given ``kernel`` +which we'll assume has been defined before over the ``edges`` and with +``coordinates`` as input data. The ``matrix`` is the output argument of this +parallel loop and therefore has the access descriptor :data:`~pyop2.INC` since +the assembly accumulates contributions from different vertices via the +``edges2vertices`` mapping. Note that the mappings are being indexed with the +:class:`iteration indices <pyop2.base.IterationIndex>` ``op2.i[0]`` and +``op2.i[1]`` respectively. This means that PyOP2 generates a :ref:`local +iteration space <local-iteration-spaces>` of size ``arity * arity`` with the +``arity`` of the :class:`~pyop2.Map` ``edges2vertices`` for any given element +of the iteration set. This local iteration space is then iterated over using +the iteration indices on the maps. The kernel is assumed to only apply to a +single point in that local iteration space. 
The ``coordinates`` are accessed +via the same mapping, but are a read-only input argument to the kernel and +therefore use the access descriptor :data:`~pyop2.READ`: :: + + op2.par_loop(kernel, edges, + matrix(op2.INC, (edges2vertices[op2.i[0]], + edges2vertices[op2.i[1]])), + coordinates(op2.READ, edges2vertices)) + +You can stack up multiple successive parallel loops that add values to +a matrix. Before you use the resulting values, you must explicitly +tell PyOP2 that you want to do so, by calling +:meth:`~pyop2.Mat.assemble` on the matrix. Note that executing a +:func:`~pyop2.solve` will do this automatically for you. + +.. _reduction-loops: + +Loops with global reductions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:class:`Globals <pyop2.Global>` are used primarily for reductions where a +given quantity on a field is reduced to a single number by summation or +finding the minimum or maximum. Consider a kernel computing the `L2 norm`_ of +the ``pressure`` field defined on the set of ``vertices`` as ``l2norm``. Note +that the :class:`~pyop2.Dat` constructor automatically creates an anonymous +:class:`~pyop2.DataSet` of dimension 1 if a :class:`~pyop2.Set` is passed as +the first argument. We assume ``pressure`` is the result of some prior +computation and only give the declaration for context. :: + + pressure = op2.Dat(vertices, [...], dtype=float) + l2norm = op2.Global(dim=1, data=[0.0]) + + norm = op2.Kernel("""void norm(double * out, double * field) { + *out += field[0] * field[0]; + }""", "norm") + + op2.par_loop(norm, vertices, + l2norm(op2.INC), + pressure(op2.READ)) + +.. _NumPy: http://docs.scipy.org/doc/numpy/reference/arrays.dtypes.html +.. _L2 norm: https://en.wikipedia.org/wiki/L2_norm#Euclidean_norm diff --git a/_sources/index.rst.txt b/_sources/index.rst.txt new file mode 100644 index 000000000..50e2f8930 --- /dev/null +++ b/_sources/index.rst.txt @@ -0,0 +1,44 @@ +.. PyOP2 documentation master file, created by + sphinx-quickstart on Tue Aug 14 10:10:00 2012. 
+ You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to PyOP2's documentation! +================================= + +.. warning:: + The prose documentation contained here is significantly out-of-date and thus + contains many inaccuracies. It is, nevertheless, quite a useful resource for + people new to PyOP2. Please read with care. + + The API documentation, however, is updated regularly and can be considered + accurate. + +Contents: + +.. toctree:: + :maxdepth: 2 + + installation + concepts + kernels + ir + architecture + backends + linear_algebra + plan + mixed + mpi + caching + profiling + user + pyop2 + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` + diff --git a/_sources/installation.rst.txt b/_sources/installation.rst.txt new file mode 100644 index 000000000..44dcf0348 --- /dev/null +++ b/_sources/installation.rst.txt @@ -0,0 +1,20 @@ +.. image:: https://travis-ci.org/OP2/PyOP2.png?branch=master + :target: https://travis-ci.org/OP2/PyOP2 + :alt: build status + +.. contents:: + +Installing PyOP2 +================ + +PyOP2 requires Python 3.6 or later. + +The main testing platform for PyOP2 is Ubuntu 18.04 64-bit with Python +3.6. Later Ubuntu versions should also work. Some users successfully +use PyOP2 on Mac OS X. + +Installation of the dependencies is somewhat involved, and therefore +the recommended way to obtain PyOP2 is by using the `Firedrake +installation script +`__. This will give +you a Python 3 venv that contains a working PyOP2 installation. diff --git a/_sources/ir.rst.txt b/_sources/ir.rst.txt new file mode 100644 index 000000000..9d9ea13f9 --- /dev/null +++ b/_sources/ir.rst.txt @@ -0,0 +1,324 @@ +The PyOP2 Intermediate Representation +===================================== + +The :class:`parallel loop ` is the main construct of PyOP2. 
+It applies a specific :class:`~pyop2.Kernel` to all elements in the iteration +set of the parallel loop. Here, we describe how to use the PyOP2 API to build +a kernel and, also, we provide simple guidelines on how to write efficient +kernels. + +Using the Intermediate Representation +------------------------------------- + +In the :doc:`previous section <kernels>`, we described the API for +PyOP2 kernels in terms of the C code that gets executed. +Passing in a string of C code is the simplest way of creating a +:class:`~pyop2.Kernel`. Another possibility is to use PyOP2 Intermediate +Representation (IR) objects to express the :class:`~pyop2.Kernel` semantics. + +An Abstract Syntax Tree of the kernel code can be manually built using IR +objects. Since PyOP2 is primarily intended to be fed by higher layers +of abstraction, rather than by users, no C-to-AST parser is currently provided. +The advantage of providing an AST, instead of C code, is that it enables PyOP2 +to inspect and transform the kernel, which is aimed at achieving performance +portability among different architectures and, more generally, better execution +times. + +For the purposes of exposition, let us consider a simple +kernel ``init`` which initialises the members of a :class:`~pyop2.Dat` +to zero. + +.. code-block:: python + + from op2 import Kernel + + code = """void init(double* edge_weight) { + for (int i = 0; i < 3; i++) + edge_weight[i] = 0.0; + }""" + kernel = Kernel(code, "init") + +Here, we describe how we can use PyOP2 IR objects to build an AST for +this kernel. For example, the most basic AST one can come up with +is + +.. code-block:: python + + from op2 import Kernel + from ir.ast_base import * + + ast = FlatBlock("""void init(double* edge_weight) { + for (int i = 0; i < 3; i++) + edge_weight[i] = 0.0; + }""") + kernel = Kernel(ast, "init") + +The :class:`~pyop2.ir.ast_base.FlatBlock` object encapsulates a "flat" block +of code, which is not modified by the IR engine. 
A +:class:`~pyop2.ir.ast_base.FlatBlock` is used to represent (possibly large) +fragments of code for which we are not interested in any kind of +transformation, so it may be particularly useful to speed up code development +when writing, for example, test cases or non-expensive kernels. On the other +hand, time-demanding kernels should be properly represented using a "real" +AST. For example, a useful AST for ``init`` could be the following + +.. code-block:: python + + from op2 import Kernel + from ir.ast_base import * + + ast_body = [FlatBlock("...some code can go here..."), + c_for("i", 3, Assign(Symbol("edge_weight", ("i",)), c_sym("0.0")))] + ast = FunDecl("void", "init", + [Decl("double*", c_sym("edge_weight"))], + ast_body) + kernel = Kernel(ast, "init") + +In this example, we first construct the body of the kernel function. We have +an initial :class:`~pyop2.ir.ast_base.FlatBlock` that contains, for instance, +some sort of initialization code. :func:`~pyop2.ir.ast_base.c_for` is a shortcut +for building a :class:`for loop <pyop2.ir.ast_base.For>`. It takes an +iteration variable (``i``), the extent of the loop and its body. Multiple +statements in the body can be passed in as a list. +:func:`~pyop2.ir.ast_base.c_sym` is a shortcut for building :class:`symbols +<pyop2.ir.ast_base.Symbol>`. You may want to use +:func:`~pyop2.ir.ast_base.c_sym` when the symbol makes no explicit use of +iteration variables. + +We use :class:`~pyop2.ir.ast_base.Symbol` instead of +:func:`~pyop2.ir.ast_base.c_sym`, when ``edge_weight`` accesses a specific +element using the iteration variable ``i``. This is fundamental to allow the +IR engine to perform many kinds of transformations involving the kernel's +iteration space(s). Finally, the signature of the function is constructed +using the :class:`~pyop2.ir.ast_base.FunDecl`. + +Other examples on how to build ASTs can be found in the tests folder, +particularly looking into ``test_matrices.py`` and +``test_iteration_space_dats.py``. 
+ + +Achieving Performance Portability with the IR +--------------------------------------------- + +One of the key objectives of PyOP2 is obtaining performance portability. +This means that exactly the same program can be executed on a range of +different platforms, and that the PyOP2 engine will strive to get the best +performance out of the chosen platform. PyOP2 allows users to write kernels +by completely abstracting from the underlying machine. This is mainly +achieved in two steps: + +* Given the AST of a kernel, PyOP2 applies a first transformation aimed at + mapping the parallelism inherent to the kernel to that available in the + backend. +* Then, PyOP2 applies optimizations to the sequential code, depending on the + underlying backend. + +To maximize the outcome of the transformation process, it is important that +kernels are written as simply as possible. That is, premature optimization, +possibly for a specific backend, might harm performance. + +A minimal language, the so-called PyOP2 Kernel Domain-Specific Language, is +used to trigger specific transformations. If we had had a parser from C +code to AST, we would have embedded this DSL in C by means of ``pragmas``. +As we directly build an AST, we achieve the same goal by decorating AST nodes +with specific attributes, added at node creation-time. An overview of the +language follows + +* ``pragma pyop2 itspace``. This is added to :class:`~pyop2.ir.ast_base.For` + nodes (i.e. written on top of for loops). It tells PyOP2 that the following + is a fully-parallel loop, that is all of its iterations can be executed in + parallel without any sort of synchronization. +* ``pragma pyop2 assembly(itvar1, itvar2)``. This is added to a statement node, + to denote that we are performing a local assembly operation along to the + ``itvar1`` and ``itvar2`` dimensions. +* ``pragma pyop2 simd``. This is added on top of the kernel signature. 
It is + used to suggest that PyOP2 apply SIMD vectorization along the ParLoop's + iteration set dimension. This kind of vectorization is also known as + *inter-kernel vectorization*. This feature is currently not supported + by PyOP2, and will be added only in a future release. + +The ``itspace`` pragma tells PyOP2 how to extract parallelism from the kernel. +Consider again our usual example. To expose a parallel iteration space, one +must write + +.. code-block:: python + + from op2 import Kernel + + code = """void init(double* edge_weight) { + #pragma pyop2 itspace + for (int i = 0; i < 3; i++) + edge_weight[i] = 0.0; + }""" + kernel = Kernel(code, "init") + +The :func:`~pyop2.ir.ast_base.c_for` shortcut when creating an AST expresses +the same semantics as a for loop decorated with a ``pragma pyop2 itspace``. + +Now, imagine we are executing the ``init`` kernel on a CPU architecture. +Typically we want a single core to execute the entire kernel, because it is +very likely that the kernel's iteration space is small and its working set +fits the L1 cache, and no benefit would be gained by splitting the computation +between distinct cores. On the other hand, if the backend is a GPU or an +accelerator, a different execution model might give better performance. +There's a huge amount of parallelism available, for example, in a GPU, so +delegating the execution of an individual iteration (or a chunk of iterations) +to a single thread could pay off. If that is the case, the PyOP2 IR engine +re-structures the kernel code to exploit such parallelism. + +Optimizing kernels on CPUs +-------------------------- + +So far, some effort has been spent on optimizations for CPU platforms. Being a +DSL, PyOP2 provides specific support for those (linear algebra) operations that +are common among unstructured-mesh-based numerical methods. For example, PyOP2 +is capable of aggressively optimizing local assembly codes for applications +based on the Finite Element Method.
We therefore distinguish optimizations in +two categories: + +* Generic optimizations, such as data alignment and support for autovectorization. +* Domain-specific optimizations (DSO) + +To trigger DSOs, statements must be decorated using the kernel DSL. For example, +if the kernel computes the local assembly of an element in an unstructured mesh, +then a ``pragma pyop2 assembly(itvar1, itvar2)`` should be added on top of the +corresponding statement. When constructing the AST of a kernel, this can be +simply achieved by + +.. code-block:: python + + from ir.ast_base import * + + s1 = Symbol("X", ("i",)) + s2 = Symbol("Y", ("j",)) + tensor = Symbol("A", ("i", "j")) + pragma = "#pragma pyop2 outerproduct(j,k)" + code = c_for("i", 3, c_for("j", 3, Incr(tensor, Prod(s1, s2), pragma))) + +That, conceptually, corresponds to + +.. code-block:: c + + #pragma pyop2 itspace + for (int i = 0; i < 3; i++) + #pragma pyop2 itspace + for (int j = 0; j < 3; j++) + #pragma pyop2 assembly(i, j) + A[i][j] += X[i]*Y[j] + +Visiting the AST, PyOP2 finds a 2-dimensional iteration space and an assembly +statement. Currently, ``#pragma pyop2 itspace`` is ignored when the backend is +a CPU. The ``#pragma pyop2 assembly(i, j)`` can trigger multiple DSOs. +PyOP2 currently lacks an autotuning system that automatically finds out the +best possible kernel implementation; that is, the optimizations that minimize +the kernel run-time. To drive the optimization process, the user (or the +higher layer) can specify which optimizations should be applied. Currently, +PyOP2 can automate: + +* Alignment and padding of data structures: for issuing aligned loads and stores. 
+* Loop trip count adjustment according to padding: useful for autovectorization + when the trip count is not a multiple of the vector length +* Loop-invariant code motion and autovectorization of invariant code: this is + particularly useful since trip counts are typically small, and hoisted code + can still represent a significant proportion of the execution time +* Register tiling for rectangular iteration spaces +* (DSO for pragma assembly): Outer-product vectorization + unroll-and-jam of + outer loops to improve register re-use or to mitigate register pressure + +How to select specific kernel optimizations +------------------------------------------- + +When constructing a :class:`~pyop2.Kernel`, it is possible to specify the set +of optimizations we want PyOP2 to apply. The IR engine will analyse the kernel +AST and will try to apply, incrementally, such optimizations. PyOP2's FFC +interface, which builds a :class:`~pyop2.Kernel` object given an AST provided +by FFC, already makes use of the available optimizations. Here, we take the +emblematic case of the FFC interface and describe how to play with the various +optimizations through a series of examples. + +.. code-block:: python + + ast = ... + opts = {'licm': False, + 'tile': None, + 'ap': False, + 'vect': None} + kernel = Kernel(ast, 'my_kernel', opts) + +In this example, we have an AST ``ast`` and we specify optimizations through +the dictionary ``opts``; then, we build the :class:`~pyop2.Kernel`, passing in +the optional argument ``opts``. No optimizations are enabled here. The +possible options are: + +* ``licm``: Loop-Invariant Code Motion. +* ``tile``: Register Tiling (of rectangular iteration spaces) +* ``ap``: Data alignment, padding. Trip count adjustment. +* ``vect``: SIMD intra-kernel vectorization. + +If we wanted to apply both loop-invariant code motion and data alignment, we +would simply write + +.. code-block:: python + + ast = ...
+ opts = {'licm': True, + 'ap': True} + kernel = Kernel(ast, 'my_kernel', opts) + +Now, let's assume we know the kernel has a rectangular iteration space. We want +to try register tiling, with a particular tile size. The way to get it is + +.. code-block:: python + + ast = ... + opts = {'tile': (True, 8)} + kernel = Kernel(ast, 'my_kernel', opts) + +In this case, the iteration space is sliced into tiles of size 8x8. If the +iteration space is smaller than the slice, then the transformation is not +applied. By specifying ``-1`` instead of ``8``, we leave PyOP2 free to choose +automatically a certain tile size. + +A fundamental optimization for any PyOP2 kernel is SIMD vectorization. This is +because almost always kernels fit the L1 cache and are likely to be compute- +bound. Backend compilers' AutoVectorization (AV) is therefore an opportunity. +By enforcing data alignment and padding, we can increase the chance AV is +successful. To try AV, one should write + +.. code-block:: python + + import ir.ast_plan as ap + + ast = ... + opts = {'ap': True, + 'vect': (ap.AUTOVECT, -1)} + kernel = Kernel(ast, 'my_kernel', opts) + +The ``vect``'s second parameter (-1) is ignored when AV is requested. +If our kernel is computing an assembly-like operation, then we can ask PyOP2 +to optimize for register locality and register pressure, by resorting to a +different vectorization technique. Early experiments show that this approach +can be particularly useful when the amount of data movement in the assembly +loops is "significant". Of course, this depends on kernel parameters (e.g. +size of assembly loop, number and size of arrays involved in the assembly) as +well as on architecture parameters (e.g. size of L1 cache, number of available +registers). This strategy takes the name of *Outer-Product Vectorization* +(OP), and can be activated in the following way (again, we suggest to use it +along with data alignment and padding). + +.. 
code-block:: python + + import ir.ast_plan as ap + + ast = ... + opts = {'ap': True, + 'vect': (ap.V_OP_UAJ, 1)} + kernel = Kernel(ast, 'my_kernel', opts) + +``UAJ`` in ``V_OP_UAJ`` stands for ``Unroll-and-Jam``. It has been proved that +OP shows a much better performance when used in combination with unrolling the +outer assembly loop and incorporating (*jamming*) the unrolled iterations +within the inner loop. The second parameter, therefore, specifies the unroll- +and-jam factor: the higher it is, the larger is the number of iterations +unrolled. A factor 1 means that no unroll-and-jam is performed. The optimal +factor highly depends on the computational characteristics of the kernel. diff --git a/_sources/kernels.rst.txt b/_sources/kernels.rst.txt new file mode 100644 index 000000000..23dcc7307 --- /dev/null +++ b/_sources/kernels.rst.txt @@ -0,0 +1,234 @@ +.. _kernels: + +PyOP2 Kernels +============= + +Kernels in PyOP2 define the local operations that are to be performed for each +element of the iteration set the kernel is executed over. There must be a one +to one match between the arguments declared in the kernel signature and the +actual arguments passed to the parallel loop executing this kernel. As +described in :doc:`concepts`, data is accessed directly on the iteration set +or via mappings passed in the :func:`~pyop2.par_loop` call. + +The kernel only sees data corresponding to the current element of the +iteration set it is invoked for. Any data read by the kernel i.e. accessed as +:data:`~pyop2.READ`, :data:`~pyop2.RW` or :data:`~pyop2.INC` is automatically +gathered via the mapping relationship in the *staging in* phase and the kernel +is passed pointers to the staging memory. Similarly, after the kernel has been +invoked, any modified data i.e. accessed as :data:`~pyop2.WRITE`, +:data:`~pyop2.RW` or :data:`~pyop2.INC` is scattered back out via the +:class:`~pyop2.Map` in the *staging out* phase. 
It is only safe for a kernel +to manipulate data in the way declared via the access descriptor in the +parallel loop call. Any modifications to an argument accessed read-only would +not be written back since the staging out phase is skipped for this argument. +Similarly, the result of reading an argument declared as write-only is +undefined since the data has not been staged in. + +.. _kernel-api: + +Kernel API +---------- + +Consider a :func:`~pyop2.par_loop` computing the midpoint of a triangle given +the three vertex coordinates. Note that we make use of a convenience in the +PyOP2 syntax, which allows declaring an anonymous :class:`~pyop2.DataSet` of a +dimension greater than one by using the ``**`` operator. We omit the actual data in +the declaration of the :class:`~pyop2.Map` ``cell2vertex`` and +:class:`~pyop2.Dat` ``coordinates``. :: + + vertices = op2.Set(num_vertices) + cells = op2.Set(num_cells) + + cell2vertex = op2.Map(cells, vertices, 3, [...]) + + coordinates = op2.Dat(vertices ** 2, [...], dtype=float) + midpoints = op2.Dat(cells ** 2, dtype=float) + + op2.par_loop(midpoint, cells, + midpoints(op2.WRITE), + coordinates(op2.READ, cell2vertex)) + +Kernels are implemented in a restricted subset of C99 and are declared by +passing a *C code string* and the *kernel function name*, which must match the +name in the C kernel signature, to the :class:`~pyop2.Kernel` constructor: :: + + midpoint = op2.Kernel(""" + void midpoint(double p[2], double *coords[2]) { + p[0] = (coords[0][0] + coords[1][0] + coords[2][0]) / 3.0; + p[1] = (coords[0][1] + coords[1][1] + coords[2][1]) / 3.0; + }""", "midpoint") + +Since kernels cannot return any value, the return type is always ``void``. The +kernel argument ``p`` corresponds to the third :func:`~pyop2.par_loop` +argument ``midpoints`` and ``coords`` to the fourth argument ``coordinates`` +respectively. Argument names need not agree; the matching is by position.
+ +Data types of kernel arguments must match the type of data passed to the +parallel loop. The Python types :class:`float` and :class:`numpy.float64` +correspond to a C :class:`double`, :class:`numpy.float32` to a C +:class:`float`, :class:`int` or :class:`numpy.int64` to a C :class:`long` and +:class:`numpy.int32` to a C :class:`int`. + +Direct :func:`~pyop2.par_loop` arguments such as ``midpoints`` are passed to +the kernel as a ``double *``, indirect arguments such as ``coordinates`` as a +``double **`` with the first indirection due to the map and the second +indirection due to the data dimension. The kernel signature above uses arrays +with explicit sizes to draw attention to the fact that these are known. We +could have interchangeably used a kernel signature with plain pointers: + +.. code-block:: c + + void midpoint(double * p, double ** coords) + +Argument creation supports an optional flag ``flatten``, which is used +for kernels which expect data to be laid out by component: :: + + midpoint = op2.Kernel(""" + void midpoint(double p[2], double *coords[1]) { + p[0] = (coords[0][0] + coords[1][0] + coords[2][0]) / 3.0; + p[1] = (coords[3][0] + coords[4][0] + coords[5][0]) / 3.0; + }""", "midpoint") + + op2.par_loop(midpoint, cells, + midpoints(op2.WRITE), + coordinates(op2.READ, cell2vertex, flatten=True)) + +.. _data-layout: + +Data layout +----------- + +Data for a :class:`~pyop2.Dat` declared on a :class:`~pyop2.Set` is +stored contiguously for all elements of the set. For each element, +this is a contiguous chunk of data of a shape given by the +:class:`~pyop2.DataSet` ``dim`` and the datatype of the +:class:`~pyop2.Dat`. The size of this chunk is the product of the +extents of the ``dim`` tuple times the size of the datatype. + +During execution of the :func:`~pyop2.par_loop`, the kernel is called +for each element of the iteration set and passed data for each of its +arguments corresponding to the current set element ``i`` only.
+ +For a directly accessed argument such as ``midpoints`` above, the +kernel is passed a pointer to the beginning of the chunk of data for +the element ``i`` the kernel is currently called for. In CUDA/OpenCL +``i`` is the global thread id since the kernel is launched in parallel +for all elements. + +.. figure:: images/direct_arg.svg + :align: center + + Data layout for a directly accessed :class:`~pyop2.Dat` argument with + ``dim`` 2 + +For an indirectly accessed argument such as ``coordinates`` above, +PyOP2 gathers pointers to the data via the :class:`~pyop2.Map` +``cell2vertex`` used for the indirection. The kernel is passed a list +of pointers of length corresponding to the *arity* of the +:class:`~pyop2.Map`, in the example above 3. Each of these points to +the data chunk for the element in the target :class:`~pyop2.Set` given +by :class:`~pyop2.Map` entries ``(i, 0)``, ``(i, 1)`` and ``(i, 2)``. + +.. figure:: images/indirect_arg.svg + :align: center + + Data layout for a :class:`~pyop2.Dat` argument with ``dim`` 2 indirectly + accessed through a :class:`~pyop2.Map` of ``arity`` 3 + +If the argument is created with the keyword argument ``flatten`` set +to ``True``, a flattened vector of pointers is passed to the kernel. +This vector is of length ``dim * arity`` (where ``dim`` is the product +of the extents of the ``dim`` tuple), which is 6 in the example above. +Each entry points to a single data value of the :class:`~pyop2.Dat`. +The ordering is by component of ``dim`` i.e. the first component of +each data item for each element in the target set pointed to by the +map followed by the second component etc. + +.. figure:: images/indirect_arg_flattened.svg + :align: center + + Data layout for a flattened :class:`~pyop2.Dat` argument with ``dim`` 2 + indirectly accessed through a :class:`~pyop2.Map` of ``arity`` 3 + +.. 
_local-iteration-spaces: + +Local iteration spaces +---------------------- + +PyOP2 supports complex kernels with large local working set sizes, which may +not run very efficiently on architectures with a limited amount of registers +and on-chip resources. In many cases the resource usage is proportional to the +size of the *local iteration space* the kernel operates on. + +Consider a finite-element local assembly kernel for vector-valued basis +functions of second order on triangles. There are kernels more complex and +computing considerably larger local tensors commonly found in finite-element +computations, in particular for higher-order basis functions, and this kernel +only serves to illustrate the concept. For each element in the iteration set, +this kernel computes a 12x12 local tensor: + +.. code-block:: c + + void kernel(double A[12][12], ...) { + ... + // loops over the local iteration space + for (int j = 0; j < 12; j++) { + for (int k = 0; k < 12; k++) { + A[j][k] += ... + } + } + } + +PyOP2 invokes this kernel for each element in the iteration set: + +.. code-block:: c + + for (int ele = 0; ele < nele; ++ele) { + double A[12][12]; + ... + kernel(A, ...); + } + +To improve the efficiency of executing complex kernels on manycore +platforms, their operation can be distributed among several threads +which each compute a single point in this local iteration space to +increase the level of parallelism and to lower the amount of resources +required per thread. In the case of the kernel above we obtain: + +.. code-block:: c + + void kernel(double A[1][1], ..., int j, int k) { + ... + A[0][0] += ... + } + +Note how the doubly nested loop over basis functions is hoisted out of the +kernel, which receives its position in the local iteration space to compute as +additional arguments ``j`` and ``k``. PyOP2 then calls the kernel for +each element of the local iteration space for each set element: + +..
code-block:: c + + for (int ele = 0; ele < nele; ++ele) { + double A[1][1]; + ... + for (int j = 0; j < 12; j++) { + for (int k = 0; k < 12; k++) { + kernel(A, ..., j, k); + } + } + } + +On manycore platforms, the local iteration space does not translate into a +loop nest, but rather into a larger number of threads being launched to +compute each of its elements: + +.. figure:: images/iteration_spaces.svg + :align: center + + Local iteration space for a kernel computing a 12x12 local tensor + +PyOP2 needs to be told to loop over this local iteration space by +indexing the corresponding maps with an +:class:`~pyop2.base.IterationIndex` :data:`~pyop2.i` in the +:func:`~pyop2.par_loop` call. diff --git a/_sources/linear_algebra.rst.txt b/_sources/linear_algebra.rst.txt new file mode 100644 index 000000000..176f15498 --- /dev/null +++ b/_sources/linear_algebra.rst.txt @@ -0,0 +1,304 @@ +.. _linear_algebra: + +PyOP2 Linear Algebra Interface +============================== + +PyOP2 supports linear algebra operations on sparse matrices using a thin +wrapper around the PETSc_ library harnessed via its petsc4py_ interface. + +As described in :doc:`concepts`, a sparse matrix is a linear operator that +maps a :class:`~pyop2.DataSet` representing its row space to a +:class:`~pyop2.DataSet` representing its column space and vice versa. These +two spaces are commonly the same, in which case the resulting matrix is +square. A sparse matrix is represented by a :class:`~pyop2.Mat`, which is +declared on a :class:`~pyop2.Sparsity`, representing its non-zero structure. + +.. _matrix_storage: + +Sparse Matrix Storage Formats +----------------------------- + +PETSc_ uses the popular Compressed Sparse Row (CSR) format to only store the +non-zero entries of a sparse matrix. In CSR, a matrix is stored as three +one-dimensional arrays of *row pointers*, *column indices* and *values*, where +the two former are of integer type and the latter of float type, usually +double. 
As the name suggests, non-zero entries are stored per row, where each +non-zero is defined by a pair of column index and corresponding value. The +column indices and values arrays therefore have a length equal to the total +number of non-zero entries. Row indices are given implicitly by the row +pointer array, which contains the starting index in the column index and +values arrays for the non-zero entries of each row. In other words, the +non-zeros for row ``i`` are at positions ``row_ptr[i]`` up to but not +including ``row_ptr[i+1]`` in the column index and values arrays. For each +row, entries are sorted by column index to allow for faster lookups using a +binary search. + +.. figure:: images/csr.svg + :align: center + + A sparse matrix and its corresponding CSR row pointer, column indices and + values arrays + +For distributed parallel storage with MPI, the rows of the matrix are +distributed evenly among the processors. Each row is then again divided into a +*diagonal* and an *off-diagonal* part, where the diagonal part comprises +columns ``i`` to ``j`` if ``i`` and ``j`` are the first and last row owned by +a given processor, and the off-diagonal part all other columns. + +.. figure:: images/mpi_matrix.svg + :align: center + + Distribution of a sparse matrix among 3 MPI processes + +.. _matrix_assembly: + +Matrix assembly +--------------- + +Sparse matrices are assembled by adding up local contributions which are +mapped to global matrix entries via a local-to-global mapping represented by a +pair of :class:`Maps <pyop2.Map>` for the row and column space. + +..
figure:: images/assembly.svg + :align: center + + Assembly of a local tensor :math:`A^K` into a global matrix :math:`A` using + the local-to-global mapping :math:`\iota_K^1` for rows and :math:`\iota_K^2` + for columns + +For each :func:`~pyop2.par_loop` that assembles a matrix, PyOP2 generates a +call to PETSc_'s MatSetValues_ function for each element of the iteration set, +adding the local contributions computed by the user kernel to the global +matrix using the given :class:`Maps <pyop2.Map>`. At the end of the +:func:`~pyop2.par_loop` PyOP2 automatically calls MatAssemblyBegin_ and +MatAssemblyEnd_ to finalise matrix assembly. + +Consider assembling a :class:`~pyop2.Mat` on a :class:`~pyop2.Sparsity` built +from a :class:`~pyop2.Map` from ``elements`` to ``nodes``. The assembly is +done in a :func:`~pyop2.par_loop` over ``elements``, where the +:class:`~pyop2.Mat` ``A`` is accessed indirectly via the ``elem_node`` +:class:`~pyop2.Map` using the :class:`~pyop2.base.IterationIndex` +:data:`~pyop2.i`: + +.. code-block:: python + + nodes = op2.Set(NUM_NODES, "nodes") + elements = op2.Set(NUM_ELE, "elements") + + elem_node = op2.Map(elements, nodes, 3, ...) + + sparsity = op2.Sparsity((nodes, nodes), (elem_node, elem_node)) + A = op2.Mat(sparsity, np.float64) + + b = op2.Dat(nodes, dtype=np.float64) + + # Assemble the matrix A + op2.par_loop(mat_kernel, elements, + A(op2.INC, (elem_node[op2.i[0]], elem_node[op2.i[1]])), + ...) + + # Assemble the right-hand side vector b + op2.par_loop(rhs_kernel, elements, + b(op2.INC, elem_node[op2.i[0]]), + ...) + +The code generated for the :func:`~pyop2.par_loop` assembling the +:class:`~pyop2.Mat` for the sequential backend is similar to the following, +where initialisation and staging code described in :ref:`sequential_backend` +have been omitted for brevity.
For each element of the iteration +:class:`~pyop2.Set` a buffer for the local tensor is initialised to zero and +passed to the user kernel performing the local assembly operation. The +``addto_vector`` call subsequently adds this local contribution to the global +sparse matrix. + +.. code-block:: c + + void wrap_mat_kernel__(...) { + ... + for ( int n = start; n < end; n++ ) { + int i = n; + ... + double buffer_arg0_0[3][3] = {{0}}; // local tensor initialised to 0 + mat_kernel(buffer_arg0_0, ...); // local assembly kernel + addto_vector(arg0_0_0, buffer_arg0_0, // Mat object, local tensor + 3, arg0_0_map0_0 + i * 3, // # rows, global row indices + 3, arg0_0_map1_0 + i * 3, // # cols, global column indices + 0); // mode: 0 add, 1 insert + } + } + +.. _sparsity_pattern: + +Building a sparsity pattern +--------------------------- + +The sparsity pattern of a matrix is uniquely defined by the dimensions of the +:class:`DataSets <pyop2.DataSet>` forming its row and column space, and one or +more pairs of :class:`Maps <pyop2.Map>` defining its non-zero structure. This +is exploited in PyOP2 by caching sparsity patterns with these unique +attributes as the cache key to save expensive recomputation. Whenever a +:class:`~pyop2.Sparsity` is initialised, an already computed pattern with the same +unique key is returned if it exists. + +For a valid sparsity, each row :class:`~pyop2.Map` must map to the set of the +row :class:`~pyop2.DataSet`, each column :class:`~pyop2.Map` to that of the +column :class:`~pyop2.DataSet` and the from sets of each pair must match. A +matrix on a sparsity pattern built from more than one pair of maps is +assembled by multiple parallel loops iterating over the corresponding +iteration set for each pair. + +Sparsity construction proceeds by iterating each :class:`~pyop2.Map` pair and +building a set of indices of the non-zero columns for each row.
Each pair of +entries in the row and column maps gives the row and column index of a +non-zero entry in the matrix and therefore the column index is added to the +set of non-zero entries for that particular row. The array of non-zero entries +per row is then determined as the size of the set for each row and its +exclusive scan yields the row pointer array. The column index array is the +concatenation of all the sets. An algorithm for the sequential case is given +below: :: + + for rowmap, colmap in maps: + for e in range(rowmap.from_size): + for i in range(rowmap.arity): + row = rowmap.values[i + e*rowmap.arity] + for d in range(colmap.arity): + diag[row].insert(colmap.values[d + e * colmap.arity]) + +For the MPI parallel case a minor modification is required, since for each row +a set of diagonal and off-diagonal column indices needs to be built as +described in :ref:`matrix_storage`: :: + + for rowmap, colmap in maps: + for e in range(rowmap.from_size): + for i in range(rowmap.arity): + row = rowmap.values[i + e*rowmap.arity] + if row < nrows: + for d in range(colmap.arity): + if col < ncols: + diag[row].insert(colmap.values[d + e*colmap.arity]) + else: + odiag[row].insert(colmap.values[d + e*colmap.arity]) + +.. _solving: + +Solving a linear system +----------------------- + +PyOP2 provides a :class:`~pyop2.Solver`, wrapping the PETSc_ KSP_ Krylov +solvers which support various iterative methods such as Conjugate Gradients +(CG), Generalized Minimal Residual (GMRES), a stabilized version of +BiConjugate Gradient Squared (BiCGStab) and others. The solvers are +complemented with a range of preconditioners from PETSc_'s PC_ collection, +which includes Jacobi, incomplete Cholesky and LU decompositions and various +multigrid based preconditioners. + +The choice of solver and preconditioner type and other parameters uses +PETSc_'s configuration mechanism documented in the `PETSc manual`_. 
Options +are passed to the :class:`~pyop2.Solver` via the keyword argument +``parameters`` taking a dictionary of arguments or directly via keyword +arguments. The solver type is chosen as ``ksp_type``, the preconditioner as +``pc_type`` with the defaults ``cg`` and ``jacobi``. + +Solving a linear system of the matrix ``A`` assembled above and the right-hand +side vector ``b`` for a solution vector ``x`` is done with a call to +:meth:`~pyop2.Solver.solve`, where solver and preconditioner are chosen as +``gmres`` and ``ilu``: :: + + x = op2.Dat(nodes, dtype=np.float64) + + solver = op2.Solver(ksp_type='gmres', pc_type='ilu') + solver.solve(A, x, b) + +.. _gpu_assembly: + +GPU matrix assembly +------------------- + +In a :func:`~pyop2.par_loop` assembling a :class:`~pyop2.Mat` on the GPU, the +local contributions are first computed for all elements of the iteration set +and stored in global memory in a structure-of-arrays (SoA) data layout such +that all threads can write the data out in a coalesced manner. For the example +above, the generated CUDA wrapper code is as follows, again omitting +initialisation and staging code described in :ref:`cuda_backend`. The user +kernel only computes a single element in the local iteration space as detailed +in :ref:`local-iteration-spaces`. + +.. code-block:: c + + __global__ void __mat_kernel_stub(..., + double *arg0, // local matrix data array + int arg0_offset, // offset into the array + ... ) { + ... // omitted initialisation and shared memory staging code + for ( int idx = threadIdx.x; idx < nelem; idx += blockDim.x ) { + ... // omitted staging code + for ( int i0 = 0; i0 < 3; ++i0 ) { + for ( int i1 = 0; i1 < 3; ++i1 ) { + mass_cell_integral_0_otherwise( + (double (*)[1])(arg0 + arg0_offset + idx * 9 + i0 * 3 + i1 * 1), + ..., i0, i1); + } + } + } + } + +A separate CUDA kernel given below is launched afterwards to compress the data +into a sparse matrix in CSR storage format.
Only the values array needs to be +computed, since the row pointer and column indices have already been computed +when building the sparsity on the host and subsequently transferred to GPU +memory. Memory for the local contributions and the values array only needs to +be allocated on the GPU. + +.. code-block:: c + + __global__ void __lma_to_csr(double *lmadata, // local matrix data array + double *csrdata, // CSR values array + int *rowptr, // CSR row pointer array + int *colidx, // CSR column indices array + int *rowmap, // row map array + int rowmapdim, // row map arity + int *colmap, // column map array + int colmapdim, // column map arity + int nelems) { + int nentries_per_ele = rowmapdim * colmapdim; + int n = threadIdx.x + blockIdx.x * blockDim.x; + if ( n >= nelems * nentries_per_ele ) return; + + int e = n / nentries_per_ele; // set element + int i = (n - e * nentries_per_ele) / rowmapdim; // local row + int j = (n - e * nentries_per_ele - i * colmapdim); // local column + + // Compute position in values array + int offset = pos(rowmap[e * rowmapdim + i], colmap[e * colmapdim + j], + rowptr, colidx); + __atomic_add(csrdata + offset, lmadata[n]); + } + +.. _gpu_solve: + +GPU linear algebra +------------------ + +Linear algebra on the GPU with the ``cuda`` backend uses the Cusp_ library, +which does not support all solvers and preconditioners provided by PETSc_. The +interface to the user is the same as for the ``sequential`` and ``openmp`` +backends. Supported solver types are CG (``cg``), GMRES (``gmres``) and +BiCGStab (``bicgstab``), with preconditioners of types Jacobi (``jacobi``), +Bridson approximate inverse (``ainv``) and algebraic multigrid (``amg``). An +exception is raised if an unsupported solver or preconditioner type is +requested. A Cusp_ solver with the chosen parameters is automatically +generated when :func:`~pyop2.solve` is called. + +..
note :: + Distributed parallel linear algebra operations with MPI are currently not + supported by the ``cuda`` backend. + +.. _PETSc: http://www.mcs.anl.gov/petsc/ +.. _petsc4py: http://pythonhosted.org/petsc4py/ +.. _MatSetValues: http://www.mcs.anl.gov/petsc/petsc-dev/docs/manualpages/Mat/MatSetValues.html +.. _MatAssemblyBegin: http://www.mcs.anl.gov/petsc/petsc-dev/docs/manualpages/Mat/MatAssemblyBegin.html +.. _MatAssemblyEnd: http://www.mcs.anl.gov/petsc/petsc-dev/docs/manualpages/Mat/MatAssemblyEnd.html +.. _KSP: http://www.mcs.anl.gov/petsc/petsc-dev/docs/manualpages/KSP/ +.. _PC: http://www.mcs.anl.gov/petsc/petsc-dev/docs/manualpages/PC/ +.. _PETSc manual: http://www.mcs.anl.gov/petsc/petsc-dev/docs/manual.pdf +.. _Cusp: http://cusplibrary.github.io diff --git a/_sources/mixed.rst.txt b/_sources/mixed.rst.txt new file mode 100644 index 000000000..2227dcf69 --- /dev/null +++ b/_sources/mixed.rst.txt @@ -0,0 +1,144 @@ +.. _mixed: + +Mixed Types +=========== + +When solving linear systems of equations as they arise for instance in the +finite-element method (FEM), one is often interested in *coupled* solutions of +more than one quantity. In fluid dynamics, a common example is solving a +coupled system of velocity and pressure as it occurs in some formulations of +the Navier-Stokes equations. + +Mixed Set, DataSet, Map and Dat +------------------------------- + +PyOP2 provides the mixed types :class:`~pyop2.MixedSet`, +:class:`~pyop2.MixedDataSet`, :class:`~pyop2.MixedMap` and +:class:`~pyop2.MixedDat` for a :class:`~pyop2.Set`, :class:`~pyop2.DataSet`, +:class:`~pyop2.Map` and :class:`~pyop2.Dat` respectively. A mixed type is +constructed from a list or other iterable of its base type and provides the +same attributes and methods. Under most circumstances types and mixed types +behave the same way and can be treated uniformly. Mixed types allow iteration +over their constituent parts and for convenience the base types are also +iterable, yielding themselves.
+A :class:`~pyop2.MixedSet` is defined from a list of sets: :: + + s1, s2 = op2.Set(N), op2.Set(M) + ms = op2.MixedSet([s1, s2]) + +There are a number of equivalent ways of defining a +:class:`~pyop2.MixedDataSet`: :: + + mds = op2.MixedDataSet([s1, s2], (1, 2)) + mds = op2.MixedDataSet([s1**1, s2**2]) + mds = op2.MixedDataSet(ms, (1, 2)) + mds = ms**(1, 2) + +A :class:`~pyop2.MixedDat` with no associated data is defined in one of the +following ways: :: + + md = op2.MixedDat(mds) + md = op2.MixedDat([s1**1, s2**2]) + md = op2.MixedDat([op2.Dat(s1**1), op2.Dat(s2**2)]) + +Finally, a :class:`~pyop2.MixedMap` is defined from a list of maps, all of +which must share the same source :class:`~pyop2.Set`: :: + + it = op2.Set(S) + mm = op2.MixedMap([op2.Map(it, s1, 2), op2.Map(it, s2, 3)]) + +Block Sparsity and Mat +---------------------- + +When declaring a :class:`~pyop2.Sparsity` on pairs of mixed maps, the +resulting sparsity pattern has a square block structure with as many block +rows and columns as there are components in the :class:`~pyop2.MixedDataSet` +forming its row and column space. In the most general case a +:class:`~pyop2.Sparsity` is constructed as follows: :: + + it = op2.Set(...) # Iteration set + sr0, sr1 = op2.Set(...), op2.Set(...) # Sets for row spaces + sc0, sc1 = op2.Set(...), op2.Set(...) # Sets for column spaces + # MixedMaps for the row and column spaces + mr = op2.MixedMap([op2.Map(it, sr0, ...), op2.Map(it, sr1, ...)]) + mc = op2.MixedMap([op2.Map(it, sc0, ...), op2.Map(it, sc1, ...)]) + # MixedDataSets for the row and column spaces + dsr = op2.MixedDataSet([sr0**1, sr1**1]) + dsc = op2.MixedDataSet([sc0**1, sc1**1]) + # Blocked sparsity + sparsity = op2.Sparsity((dsr, dsc), [(mr, mc), ...]) + +The relationship of each component of the mixed maps and datasets to the +blocks of the :class:`~pyop2.Sparsity` is shown in the following diagram: + +..
figure:: images/mixed_sparsity.svg + :align: center + + The contribution of sets, maps and datasets to the blocked sparsity. + +Block sparsity patterns are computed separately for each block as described in +:ref:`sparsity_pattern` and the same validity rules apply. A +:class:`~pyop2.Mat` defined on a block :class:`~pyop2.Sparsity` has the same +block structure, which is implemented using a PETSc_ MATNEST_. + +Mixed Assembly +-------------- + +When assembling into a :class:`~pyop2.MixedDat` or a block +:class:`~pyop2.Mat`, the :class:`~pyop2.Kernel` produces a local tensor of the +same block structure, which is a combination of :ref:`local-iteration-spaces` +of all its subblocks. This is entirely transparent to the kernel however, +which sees the combined local iteration space. PyOP2 ensures that indirectly +accessed data is gathered and scattered via the correct maps and packed +together into a contiguous vector to be passed to the kernel. Contributions +from the local tensor are assembled into the correct blocks of the +:class:`~pyop2.MixedDat` or :class:`~pyop2.Mat`. + +Consider the following example :func:`~pyop2.par_loop` assembling a block +:class:`~pyop2.Mat`: + +.. code-block:: python + + it, cells, nodes = op2.Set(...), op2.Set(...), op2.Set(...) + mds = op2.MixedDataSet([nodes, cells]) + mmap = op2.MixedMap([op2.Map(it, nodes, 2, ...), op2.Map(it, cells, 1, ...)]) + mat = op2.Mat(op2.Sparsity(mds, mmap)) + d = op2.MixedDat(mds) + + op2.par_loop(kernel, it, + mat(op2.INC, (mmap[op2.i[0]], mmap[op2.i[1]])), + d(op2.READ, mmap)) + +The ``kernel`` for this :func:`~pyop2.par_loop` assembles a 3x3 local tensor +and is passed an input vector of length 3 for each iteration set element: + +..
code-block:: c + + void kernel(double v[3][3] , double **d ) { + for (int i = 0; i<3; i++) + for (int j = 0; j<3; j++) + v[i][j] += d[i][0] * d[j][0]; + } + +The top-left 2x2 block of the local tensor is assembled into the (0,0) block +of the matrix, the top-right 2x1 block into (0,1), the bottom-left 1x2 block +into (1,0) and finally the bottom-right 1x1 block into (1,1). Note that for +the (0,0) block only the first component of the :class:`~pyop2.MixedDat` is +read and for the (1,1) block only the second component. For the (0,1) and +(1,0) blocks, both components of the :class:`~pyop2.MixedDat` are accessed. + +This diagram illustrates the assembly of the block :class:`~pyop2.Mat`: + +.. figure:: images/mixed_assembly.svg + :align: center + + Assembling into the blocks of a global matrix :math:`A`: block + :math:`A^{0,0}` uses maps :math:`\iota^{1,0}` and :math:`\iota^{2,0}`, + :math:`A^{0,1}` uses :math:`\iota^{1,0}` and :math:`\iota^{2,1}`, + :math:`A^{1,0}` uses :math:`\iota^{1,1}` and :math:`\iota^{2,0}` and finally + :math:`A^{1,1}` uses :math:`\iota^{1,1}` and :math:`\iota^{2,1}` for the row + and column spaces respectively. + +.. _PETSc: http://www.mcs.anl.gov/petsc/ +.. _MATNEST: http://www.mcs.anl.gov/petsc/petsc-current/docs/manualpages/Mat/MATNEST.html diff --git a/_sources/mpi.rst.txt b/_sources/mpi.rst.txt new file mode 100644 index 000000000..360253cda --- /dev/null +++ b/_sources/mpi.rst.txt @@ -0,0 +1,125 @@ +.. _mpi: + +MPI +=== + +Distributed parallel computations with MPI in PyOP2 require the mesh to be +partitioned among the processors. To be able to compute over entities on their +boundaries, partitions need to access data owned by neighboring processors. +This region, called the *halo*, needs to be kept up to date and is therefore +exchanged between the processors as required. 
+Local Numbering +--------------- + +The partition of each :class:`~pyop2.Set` local to each process consists of +entities *owned* by the process and the *halo*, which are entities owned by +other processes but required to compute on the boundary of the owned entities. +Each of these sections is again divided into two sections required to +efficiently overlap communication and computation and avoid communication +during matrix assembly as described below. Each locally stored +:class:`~pyop2.Set` entity therefore belongs to one of four categories: + +* **Core**: Entities owned by this processor which can be processed without + accessing halo data. +* **Owned**: Entities owned by this processor which access halo data when + processed. +* **Exec halo**: Off-processor entities which are redundantly executed over + because they touch owned entities. +* **Non-exec halo**: Off-processor entities which are not processed, but read + when computing the exec halo. + +The following diagram illustrates the four sections for a mesh distributed +among two processors: + +.. figure:: images/pyop2_mpi_mesh.svg + :align: center + + A mesh distributed among two processors with the entities of each mesh + partition divided into *core*, *owned*, *exec halo* and *non-exec halo*. + Matching halo sections are highlighted in matching colours. The owned + section of process 0 corresponds to the non-exec section of process 1. + +For data defined on the :class:`~pyop2.Set` to be stored contiguously per +section, local :class:`~pyop2.Set` entities must be numbered such that core +entities are first, followed by owned, exec halo and non-exec halo in that +order. A good partitioning maximises the size of the core section and +minimises the halo regions. We can therefore assume that the vast majority of +local :class:`~pyop2.Set` entities are in the core section.
+Computation-communication Overlap +--------------------------------- + +The ordering of :class:`~pyop2.Set` entities into four sections allows for a +very efficient overlap of computation and communication. Core entities do +not access any halo data and can therefore be processed in their entirety +immediately after the halo exchange has been initiated. Execution over the +owned and exec halo regions requires up to date halo data and can only start +once the halo exchange is completed. Depending on the latency and bandwidth +of communication and the size of the core section relative to the halo, the +halo exchange may complete before the computation on the core section. + +The entire process is given below: :: + + halo_exchange_begin() # Initiate halo exchange + maybe_set_dat_dirty() # Mark Dats as modified + compute_if_not_empty(itset.core_part) # Compute core region + halo_exchange_end() # Wait for halo exchange + compute_if_not_empty(itset.owned_part) # Compute owned region + reduction_begin() # Initiate reductions + if needs_exec_halo: # Any indirect Dat not READ? + compute_if_not_empty(itset.exec_part) # Compute exec halo region + reduction_end() # Wait for reductions + maybe_set_halo_update_needed() # Mark halos as out of date + assemble() # Finalise matrix assembly + +Any reductions depend on data from the core and owned sections and are +initiated as soon as the owned section has been processed and execute +concurrently with computation on the exec halo. Similar to +`halo_exchange_begin` and `halo_exchange_end`, `reduction_begin` and +`reduction_end` do no work at all if none of the :func:`~pyop2.par_loop` +arguments requires a reduction. If the :func:`~pyop2.par_loop` assembles a +:class:`~pyop2.Mat`, the matrix assembly is finalised at the end. + +By dividing entities into sections according to their relation to the halo, +there is no need to check whether or not a given entity touches the halo +during computations on each section.
This avoids branching in kernels or +wrapper code and allows launching separate kernels for GPU execution of each +section. The :func:`~pyop2.par_loop` execution therefore has the above +structure for all backends. + +Halo exchange +------------- + +Exchanging halo data is only required if the halo data is actually read, which +is the case for :class:`~pyop2.Dat` arguments to a :func:`~pyop2.par_loop` +used in :data:`pyop2.READ` or :data:`pyop2.RW` mode. PyOP2 keeps track of +whether or not the halo region may have been modified. This is the case for +:class:`Dats <pyop2.Dat>` used in :data:`pyop2.INC`, :data:`pyop2.WRITE` or +:data:`pyop2.RW` mode or when a :class:`~pyop2.Solver` or a user requests +access to the data. A halo exchange is triggered only for halos marked as out +of date. + +Distributed Assembly +-------------------- + +For an MPI distributed matrix or vector, assembling owned entities at the +boundary can contribute to off-process degrees of freedom and vice versa. + +There are different ways of accounting for these off-process contributions. +PETSc_ supports insertion and subsequent communication of off-process matrix +and vector entries, however its implementation is not thread safe. Concurrent +insertion into PETSc_ MPI matrices *is* thread safe if off-process insertions +are not cached and concurrent writes to rows are avoided, which is done +through colouring as described in :ref:`plan-colouring`. + +PyOP2 therefore disables PETSc_'s off-process insertion feature and instead +redundantly computes over all off-process entities that touch local dofs, +which is the *exec halo* section described above. The price for this is +maintaining a larger halo, since we also need halo data, the *non-exec halo* +section, to perform the redundant computation.
Halos grow by about a factor of +two, however in practice this is still small compared to the interior region +of a partition and the main cost of halo exchange is the latency, which is +independent of the exchanged data volume. + +.. _PETSc: http://www.mcs.anl.gov/petsc/ diff --git a/_sources/plan.rst.txt b/_sources/plan.rst.txt new file mode 100644 index 000000000..613ca8ae2 --- /dev/null +++ b/_sources/plan.rst.txt @@ -0,0 +1,80 @@ +.. _plan: + +Parallel Execution Plan +======================= + +For all PyOP2 backends with the exception of sequential, a parallel execution +plan is computed for each :func:`~pyop2.par_loop`. It contains information +guiding the code generator on how to partition, stage and colour the data for +efficient parallel processing. + +.. _plan-partitioning: + +Partitioning +------------ + +The iteration set is split into a number of equally sized and contiguous +mini-partitions such that the working set of each mini-partition fits into +shared memory or last level cache. This is unrelated to the partitioning +required for MPI as described in :ref:`mpi`. + +.. _plan-renumbering: + +Local Renumbering and Staging +----------------------------- + +While a mini-partition is a contiguous chunk of the iteration set, the +indirectly accessed data it references is not necessarily contiguous. For each +mini-partition and unique :class:`~pyop2.Dat`-:class:`~pyop2.Map` pair, a +mapping from local indices within the partition to global indices is +constructed as the sorted array of unique :class:`~pyop2.Map` indices accessed +by this partition. At the same time, a global-to-local mapping is constructed +as its inverse. + +Data for indirectly accessed :class:`~pyop2.Dat` arguments is staged in shared +device memory as described in :ref:`backends`. For each partition, the +local-to-global mapping indicates where data to be staged in is read from and +the global-to-local mapping gives the location in shared memory where the data +has been staged.
The amount of shared memory required is computed from the size of +the local-to-global mapping. + +.. _plan-colouring: + +Colouring +--------- + +A two-level colouring is used to avoid race conditions. Partitions are +coloured such that partitions of the same colour can be executed concurrently +and threads executing on a partition in parallel are coloured such that no two +threads indirectly reference the same data. Only :func:`~pyop2.par_loop` +arguments performing an indirect reduction or assembling a matrix require +colouring. Matrices are coloured per row. + +For each element of a :class:`~pyop2.Set` indirectly accessed in a +:func:`~pyop2.par_loop`, a bit vector is used to record which colours +indirectly reference it. To colour each thread within a partition, the +algorithm proceeds as follows: + +1. Loop over all indirectly accessed arguments and collect the colours of all + :class:`~pyop2.Set` elements referenced by the current thread in a bit mask. +2. Choose the next available colour as the colour of the current thread. +3. Loop over all :class:`~pyop2.Set` elements indirectly accessed by the + current thread again and set the new colour in their colour mask. + +Since the bit mask is a 32-bit integer, up to 32 colours can be processed in a +single pass, which is sufficient for most applications. If not all threads can +be coloured with 32 distinct colours, the mask is reset and another pass is +made, where each newly allocated colour is offset by 32. Should another pass +be required, the offset is increased to 64 and so on until all threads are +coloured. + +.. figure:: images/pyop2_colouring.svg + :align: center + + Thread colouring within a mini-partition for a :class:`~pyop2.Dat` on + vertices indirectly accessed in a computation over the edges. The edges are + coloured such that no two edges touch the same vertex within the partition. 
+The colouring of mini-partitions is done in the same way, except that all +:class:`~pyop2.Set` elements indirectly accessed by the entire partition are +referenced, not only those accessed by a single thread. diff --git a/_sources/profiling.rst.txt b/_sources/profiling.rst.txt new file mode 100644 index 000000000..aa7cc2baf --- /dev/null +++ b/_sources/profiling.rst.txt @@ -0,0 +1,170 @@ +Profiling +========= + +Profiling PyOP2 programs +------------------------ + +Profiling a PyOP2 program is as simple as profiling any other Python +code. You can profile the jacobi demo in the PyOP2 ``demo`` folder as +follows: :: + + python -m cProfile -o jacobi.dat jacobi.py + +This will run the entire program under cProfile_ and write the profiling +data to ``jacobi.dat``. Omitting ``-o`` will print a summary to stdout, +which is not very helpful in most cases. + +Creating a graph +................ + +There is a much more intuitive way of representing the profiling data +using the excellent gprof2dot_ to generate a graph. Install from `PyPI +<https://pypi.org/project/gprof2dot/>`__ with :: + + sudo pip install gprof2dot + +Use as follows to create a PDF: :: + + gprof2dot -f pstats -n 1 jacobi.dat | dot -Tpdf -o jacobi.pdf + +``-f pstats`` tells ``gprof2dot`` that it is dealing with Python +cProfile_ data (and not actual *gprof* data) and ``-n 1`` ignores +everything that makes up less than 1% of the total runtime - most likely +you are not interested in that (the default is 0.5). + +Consolidating profiles from different runs +..........................................
+To aggregate profiling data from different runs, save the following as +``concat.py``: :: + + """Usage: concat.py PATTERN FILE""" + + import sys + from glob import glob + from pstats import Stats + + if len(sys.argv) != 3: + print(__doc__) + sys.exit(1) + files = glob(sys.argv[1]) + s = Stats(files[0]) + for f in files[1:]: s.add(f) + s.dump_stats(sys.argv[2]) + +With profiles from different runs named ``<basename>.*.part``, use it +as :: + + python concat.py '<basename>.*.part' <basename>.dat + +and then call ``gprof2dot`` as before. + +Using PyOP2's internal timers +----------------------------- + +PyOP2 automatically times the execution of certain regions: + +* Sparsity building +* Plan construction +* Parallel loop kernel execution +* Halo exchange +* Reductions +* PETSc Krylov solver + +To output those timings, call :func:`~pyop2.profiling.summary` in your +PyOP2 program or run with the environment variable +``PYOP2_PRINT_SUMMARY`` set to 1. + +To query e.g. the timer for parallel loop execution programmatically, +use the :func:`~pyop2.profiling.timing` helper: :: + + from pyop2 import timing + timing("ParLoop compute") # get total time + timing("ParLoop compute", total=False) # get average time per call + +To add additional timers to your own code, you can use the +:func:`~pyop2.profiling.timed_region` and +:func:`~pyop2.profiling.timed_function` helpers: :: + + from pyop2.profiling import timed_region, timed_function + + with timed_region("my code"): + # my code + + @timed_function("my function") + def my_func(): + # my func + +Line-by-line profiling +---------------------- + +To get a line-by-line profile of a given function, install Robert Kern's +`line profiler`_ and: + +1. Import the :func:`~pyop2.profiling.profile` decorator: :: + + from pyop2.profiling import profile + +2. Decorate the function to profile with ``@profile`` +3. Run your script with ``kernprof.py -l <script.py>`` +4.
Generate an annotated source file with :: + + python -m line_profiler <script.py>.lprof + +Note that ``kernprof.py`` injects the ``@profile`` decorator into the +Python builtins namespace. PyOP2 provides a passthrough version of this +decorator which does nothing if ``profile`` is not found in +``__builtins__``. This means you can run your script regularly without +having to remove the decorators again. + +The :func:`~pyop2.profiling.profile` decorator also works with the +memory profiler (see below). PyOP2 therefore provides the +:func:`~pyop2.profiling.lineprof` decorator which is only enabled when +running with ``kernprof.py``. + +A number of PyOP2 internal functions are decorated such that running +your PyOP2 application with ``kernprof.py`` will produce a line-by-line +profile of the parallel loop computation (but not the generated code!). + +Memory profiling +---------------- + +To profile the memory usage of your application, install Fabian +Pedregosa's `memory profiler`_ and: + +1. Import the :func:`~pyop2.profiling.profile` decorator: :: + + from pyop2.profiling import profile + +2. Decorate the function to profile with ``@profile``. +3. Run your script with :: + + python -m memory_profiler <script.py> + + to get a line-by-line memory profile of your function. +4. Run your script with :: + + memprof run --python <script.py> + + to record memory usage of your program over time. +5. Generate a plot of the memory profile with ``memprof plot``. + +Note that ``memprof`` and ``python -m memory_profiler`` inject the +``@profile`` decorator into the Python builtins namespace. PyOP2 +provides a passthrough version of this decorator which does nothing if +``profile`` is not found in ``__builtins__``. This means you can run +your script regularly without having to remove the decorators again. + +The :func:`~pyop2.profiling.profile` decorator also works with the line +profiler (see above).
PyOP2 therefore provides the +:func:`~pyop2.profiling.memprof` decorator which is only enabled when +running with ``memprof``. + +A number of PyOP2 internal functions are decorated such that running +your PyOP2 application with ``memprof run`` will produce a memory +profile of the parallel loop computation (but not the generated code!). + +.. _cProfile: https://docs.python.org/2/library/profile.html#cProfile +.. _gprof2dot: https://code.google.com/p/jrfonseca/wiki/Gprof2Dot +.. _line profiler: https://pythonhosted.org/line_profiler/ +.. _memory profiler: https://github.com/fabianp/memory_profiler diff --git a/_sources/pyop2.codegen.rst.txt b/_sources/pyop2.codegen.rst.txt new file mode 100644 index 000000000..53e8253dd --- /dev/null +++ b/_sources/pyop2.codegen.rst.txt @@ -0,0 +1,61 @@ +pyop2.codegen package +===================== + +Submodules +---------- + +pyop2.codegen.builder module +---------------------------- + +.. automodule:: pyop2.codegen.builder + :members: + :undoc-members: + :show-inheritance: + +pyop2.codegen.loopycompat module +-------------------------------- + +.. automodule:: pyop2.codegen.loopycompat + :members: + :undoc-members: + :show-inheritance: + +pyop2.codegen.node module +------------------------- + +.. automodule:: pyop2.codegen.node + :members: + :undoc-members: + :show-inheritance: + +pyop2.codegen.optimise module +----------------------------- + +.. automodule:: pyop2.codegen.optimise + :members: + :undoc-members: + :show-inheritance: + +pyop2.codegen.rep2loopy module +------------------------------ + +.. automodule:: pyop2.codegen.rep2loopy + :members: + :undoc-members: + :show-inheritance: + +pyop2.codegen.representation module +----------------------------------- + +.. automodule:: pyop2.codegen.representation + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. 
automodule:: pyop2.codegen + :members: + :undoc-members: + :show-inheritance: diff --git a/_sources/pyop2.rst.txt b/_sources/pyop2.rst.txt new file mode 100644 index 000000000..0078f2f33 --- /dev/null +++ b/_sources/pyop2.rst.txt @@ -0,0 +1,142 @@ +pyop2 package +============= + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + pyop2.codegen + pyop2.types + +Submodules +---------- + +pyop2.caching module +-------------------- + +.. automodule:: pyop2.caching + :members: + :undoc-members: + :show-inheritance: + +pyop2.compilation module +------------------------ + +.. automodule:: pyop2.compilation + :members: + :undoc-members: + :show-inheritance: + +pyop2.configuration module +-------------------------- + +.. automodule:: pyop2.configuration + :members: + :undoc-members: + :show-inheritance: + +pyop2.datatypes module +---------------------- + +.. automodule:: pyop2.datatypes + :members: + :undoc-members: + :show-inheritance: + +pyop2.exceptions module +----------------------- + +.. automodule:: pyop2.exceptions + :members: + :undoc-members: + :show-inheritance: + +pyop2.global\_kernel module +--------------------------- + +.. automodule:: pyop2.global_kernel + :members: + :undoc-members: + :show-inheritance: + +pyop2.local\_kernel module +-------------------------- + +.. automodule:: pyop2.local_kernel + :members: + :undoc-members: + :show-inheritance: + +pyop2.logger module +------------------- + +.. automodule:: pyop2.logger + :members: + :undoc-members: + :show-inheritance: + +pyop2.mpi module +---------------- + +.. automodule:: pyop2.mpi + :members: + :undoc-members: + :show-inheritance: + +pyop2.op2 module +---------------- + +.. automodule:: pyop2.op2 + :members: + :undoc-members: + :show-inheritance: + +pyop2.parloop module +-------------------- + +.. automodule:: pyop2.parloop + :members: + :undoc-members: + :show-inheritance: + +pyop2.profiling module +---------------------- + +.. 
automodule:: pyop2.profiling + :members: + :undoc-members: + :show-inheritance: + +pyop2.sparsity module +--------------------- + +.. automodule:: pyop2.sparsity + :members: + :undoc-members: + :show-inheritance: + +pyop2.utils module +------------------ + +.. automodule:: pyop2.utils + :members: + :undoc-members: + :show-inheritance: + +pyop2.version module +-------------------- + +.. automodule:: pyop2.version + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: pyop2 + :members: + :undoc-members: + :show-inheritance: diff --git a/_sources/pyop2.types.rst.txt b/_sources/pyop2.types.rst.txt new file mode 100644 index 000000000..543b170e0 --- /dev/null +++ b/_sources/pyop2.types.rst.txt @@ -0,0 +1,85 @@ +pyop2.types package +=================== + +Submodules +---------- + +pyop2.types.access module +------------------------- + +.. automodule:: pyop2.types.access + :members: + :undoc-members: + :show-inheritance: + +pyop2.types.dat module +---------------------- + +.. automodule:: pyop2.types.dat + :members: + :undoc-members: + :show-inheritance: + +pyop2.types.data\_carrier module +-------------------------------- + +.. automodule:: pyop2.types.data_carrier + :members: + :undoc-members: + :show-inheritance: + +pyop2.types.dataset module +-------------------------- + +.. automodule:: pyop2.types.dataset + :members: + :undoc-members: + :show-inheritance: + +pyop2.types.glob module +----------------------- + +.. automodule:: pyop2.types.glob + :members: + :undoc-members: + :show-inheritance: + +pyop2.types.halo module +----------------------- + +.. automodule:: pyop2.types.halo + :members: + :undoc-members: + :show-inheritance: + +pyop2.types.map module +---------------------- + +.. automodule:: pyop2.types.map + :members: + :undoc-members: + :show-inheritance: + +pyop2.types.mat module +---------------------- + +.. 
automodule:: pyop2.types.mat + :members: + :undoc-members: + :show-inheritance: + +pyop2.types.set module +---------------------- + +.. automodule:: pyop2.types.set + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: pyop2.types + :members: + :undoc-members: + :show-inheritance: diff --git a/_sources/user.rst.txt b/_sources/user.rst.txt new file mode 100644 index 000000000..c44b4d4c1 --- /dev/null +++ b/_sources/user.rst.txt @@ -0,0 +1,68 @@ +pyop2 user documentation +======================== + +:mod:`pyop2` Package +-------------------- + +.. automodule:: pyop2 + :members: + :show-inheritance: + :inherited-members: + + Initialization and finalization + ............................... + + .. autofunction:: init + .. autofunction:: exit + + Data structures + ............... + + .. autoclass:: Set + :inherited-members: + .. autoclass:: ExtrudedSet + :inherited-members: + .. autoclass:: Subset + :inherited-members: + .. autoclass:: MixedSet + :inherited-members: + .. autoclass:: DataSet + :inherited-members: + .. autoclass:: MixedDataSet + :inherited-members: + .. autoclass:: Map + :inherited-members: + .. autoclass:: MixedMap + :inherited-members: + .. autoclass:: Sparsity + :inherited-members: + + .. autoclass:: Const + :inherited-members: + .. autoclass:: Global + :inherited-members: + .. autoclass:: Dat + :inherited-members: + .. autoclass:: MixedDat + :inherited-members: + .. autoclass:: Mat + :inherited-members: + + Parallel loops, kernels and linear solves + ......................................... + + .. autofunction:: par_loop + .. autofunction:: solve + + .. autoclass:: Kernel + :inherited-members: + .. autoclass:: Solver + :inherited-members: + + .. autodata:: i + .. autodata:: READ + .. autodata:: WRITE + .. autodata:: RW + .. autodata:: INC + .. autodata:: MIN + .. 
autodata:: MAX diff --git a/_static/basic.css b/_static/basic.css new file mode 100644 index 000000000..f316efcb4 --- /dev/null +++ b/_static/basic.css @@ -0,0 +1,925 @@ +/* + * basic.css + * ~~~~~~~~~ + * + * Sphinx stylesheet -- basic theme. + * + * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. + * :license: BSD, see LICENSE for details. + * + */ + +/* -- main layout ----------------------------------------------------------- */ + +div.clearer { + clear: both; +} + +div.section::after { + display: block; + content: ''; + clear: left; +} + +/* -- relbar ---------------------------------------------------------------- */ + +div.related { + width: 100%; + font-size: 90%; +} + +div.related h3 { + display: none; +} + +div.related ul { + margin: 0; + padding: 0 0 0 10px; + list-style: none; +} + +div.related li { + display: inline; +} + +div.related li.right { + float: right; + margin-right: 5px; +} + +/* -- sidebar --------------------------------------------------------------- */ + +div.sphinxsidebarwrapper { + padding: 10px 5px 0 10px; +} + +div.sphinxsidebar { + float: left; + width: 230px; + margin-left: -100%; + font-size: 90%; + word-wrap: break-word; + overflow-wrap : break-word; +} + +div.sphinxsidebar ul { + list-style: none; +} + +div.sphinxsidebar ul ul, +div.sphinxsidebar ul.want-points { + margin-left: 20px; + list-style: square; +} + +div.sphinxsidebar ul ul { + margin-top: 0; + margin-bottom: 0; +} + +div.sphinxsidebar form { + margin-top: 10px; +} + +div.sphinxsidebar input { + border: 1px solid #98dbcc; + font-family: sans-serif; + font-size: 1em; +} + +div.sphinxsidebar #searchbox form.search { + overflow: hidden; +} + +div.sphinxsidebar #searchbox input[type="text"] { + float: left; + width: 80%; + padding: 0.25em; + box-sizing: border-box; +} + +div.sphinxsidebar #searchbox input[type="submit"] { + float: left; + width: 20%; + border-left: none; + padding: 0.25em; + box-sizing: border-box; +} + + +img { + border: 0; + max-width: 
100%; +} + +/* -- search page ----------------------------------------------------------- */ + +ul.search { + margin: 10px 0 0 20px; + padding: 0; +} + +ul.search li { + padding: 5px 0 5px 20px; + background-image: url(file.png); + background-repeat: no-repeat; + background-position: 0 7px; +} + +ul.search li a { + font-weight: bold; +} + +ul.search li p.context { + color: #888; + margin: 2px 0 0 30px; + text-align: left; +} + +ul.keywordmatches li.goodmatch a { + font-weight: bold; +} + +/* -- index page ------------------------------------------------------------ */ + +table.contentstable { + width: 90%; + margin-left: auto; + margin-right: auto; +} + +table.contentstable p.biglink { + line-height: 150%; +} + +a.biglink { + font-size: 1.3em; +} + +span.linkdescr { + font-style: italic; + padding-top: 5px; + font-size: 90%; +} + +/* -- general index --------------------------------------------------------- */ + +table.indextable { + width: 100%; +} + +table.indextable td { + text-align: left; + vertical-align: top; +} + +table.indextable ul { + margin-top: 0; + margin-bottom: 0; + list-style-type: none; +} + +table.indextable > tbody > tr > td > ul { + padding-left: 0em; +} + +table.indextable tr.pcap { + height: 10px; +} + +table.indextable tr.cap { + margin-top: 10px; + background-color: #f2f2f2; +} + +img.toggler { + margin-right: 3px; + margin-top: 3px; + cursor: pointer; +} + +div.modindex-jumpbox { + border-top: 1px solid #ddd; + border-bottom: 1px solid #ddd; + margin: 1em 0 1em 0; + padding: 0.4em; +} + +div.genindex-jumpbox { + border-top: 1px solid #ddd; + border-bottom: 1px solid #ddd; + margin: 1em 0 1em 0; + padding: 0.4em; +} + +/* -- domain module index --------------------------------------------------- */ + +table.modindextable td { + padding: 2px; + border-collapse: collapse; +} + +/* -- general body styles --------------------------------------------------- */ + +div.body { + min-width: 360px; + max-width: 800px; +} + +div.body p, div.body dd, 
div.body li, div.body blockquote { + -moz-hyphens: auto; + -ms-hyphens: auto; + -webkit-hyphens: auto; + hyphens: auto; +} + +a.headerlink { + visibility: hidden; +} + +a:visited { + color: #551A8B; +} + +h1:hover > a.headerlink, +h2:hover > a.headerlink, +h3:hover > a.headerlink, +h4:hover > a.headerlink, +h5:hover > a.headerlink, +h6:hover > a.headerlink, +dt:hover > a.headerlink, +caption:hover > a.headerlink, +p.caption:hover > a.headerlink, +div.code-block-caption:hover > a.headerlink { + visibility: visible; +} + +div.body p.caption { + text-align: inherit; +} + +div.body td { + text-align: left; +} + +.first { + margin-top: 0 !important; +} + +p.rubric { + margin-top: 30px; + font-weight: bold; +} + +img.align-left, figure.align-left, .figure.align-left, object.align-left { + clear: left; + float: left; + margin-right: 1em; +} + +img.align-right, figure.align-right, .figure.align-right, object.align-right { + clear: right; + float: right; + margin-left: 1em; +} + +img.align-center, figure.align-center, .figure.align-center, object.align-center { + display: block; + margin-left: auto; + margin-right: auto; +} + +img.align-default, figure.align-default, .figure.align-default { + display: block; + margin-left: auto; + margin-right: auto; +} + +.align-left { + text-align: left; +} + +.align-center { + text-align: center; +} + +.align-default { + text-align: center; +} + +.align-right { + text-align: right; +} + +/* -- sidebars -------------------------------------------------------------- */ + +div.sidebar, +aside.sidebar { + margin: 0 0 0.5em 1em; + border: 1px solid #ddb; + padding: 7px; + background-color: #ffe; + width: 40%; + float: right; + clear: right; + overflow-x: auto; +} + +p.sidebar-title { + font-weight: bold; +} + +nav.contents, +aside.topic, +div.admonition, div.topic, blockquote { + clear: left; +} + +/* -- topics ---------------------------------------------------------------- */ + +nav.contents, +aside.topic, +div.topic { + border: 1px solid 
#ccc; + padding: 7px; + margin: 10px 0 10px 0; +} + +p.topic-title { + font-size: 1.1em; + font-weight: bold; + margin-top: 10px; +} + +/* -- admonitions ----------------------------------------------------------- */ + +div.admonition { + margin-top: 10px; + margin-bottom: 10px; + padding: 7px; +} + +div.admonition dt { + font-weight: bold; +} + +p.admonition-title { + margin: 0px 10px 5px 0px; + font-weight: bold; +} + +div.body p.centered { + text-align: center; + margin-top: 25px; +} + +/* -- content of sidebars/topics/admonitions -------------------------------- */ + +div.sidebar > :last-child, +aside.sidebar > :last-child, +nav.contents > :last-child, +aside.topic > :last-child, +div.topic > :last-child, +div.admonition > :last-child { + margin-bottom: 0; +} + +div.sidebar::after, +aside.sidebar::after, +nav.contents::after, +aside.topic::after, +div.topic::after, +div.admonition::after, +blockquote::after { + display: block; + content: ''; + clear: both; +} + +/* -- tables ---------------------------------------------------------------- */ + +table.docutils { + margin-top: 10px; + margin-bottom: 10px; + border: 0; + border-collapse: collapse; +} + +table.align-center { + margin-left: auto; + margin-right: auto; +} + +table.align-default { + margin-left: auto; + margin-right: auto; +} + +table caption span.caption-number { + font-style: italic; +} + +table caption span.caption-text { +} + +table.docutils td, table.docutils th { + padding: 1px 8px 1px 5px; + border-top: 0; + border-left: 0; + border-right: 0; + border-bottom: 1px solid #aaa; +} + +th { + text-align: left; + padding-right: 5px; +} + +table.citation { + border-left: solid 1px gray; + margin-left: 1px; +} + +table.citation td { + border-bottom: none; +} + +th > :first-child, +td > :first-child { + margin-top: 0px; +} + +th > :last-child, +td > :last-child { + margin-bottom: 0px; +} + +/* -- figures --------------------------------------------------------------- */ + +div.figure, figure { + margin: 
0.5em; + padding: 0.5em; +} + +div.figure p.caption, figcaption { + padding: 0.3em; +} + +div.figure p.caption span.caption-number, +figcaption span.caption-number { + font-style: italic; +} + +div.figure p.caption span.caption-text, +figcaption span.caption-text { +} + +/* -- field list styles ----------------------------------------------------- */ + +table.field-list td, table.field-list th { + border: 0 !important; +} + +.field-list ul { + margin: 0; + padding-left: 1em; +} + +.field-list p { + margin: 0; +} + +.field-name { + -moz-hyphens: manual; + -ms-hyphens: manual; + -webkit-hyphens: manual; + hyphens: manual; +} + +/* -- hlist styles ---------------------------------------------------------- */ + +table.hlist { + margin: 1em 0; +} + +table.hlist td { + vertical-align: top; +} + +/* -- object description styles --------------------------------------------- */ + +.sig { + font-family: 'Consolas', 'Menlo', 'DejaVu Sans Mono', 'Bitstream Vera Sans Mono', monospace; +} + +.sig-name, code.descname { + background-color: transparent; + font-weight: bold; +} + +.sig-name { + font-size: 1.1em; +} + +code.descname { + font-size: 1.2em; +} + +.sig-prename, code.descclassname { + background-color: transparent; +} + +.optional { + font-size: 1.3em; +} + +.sig-paren { + font-size: larger; +} + +.sig-param.n { + font-style: italic; +} + +/* C++ specific styling */ + +.sig-inline.c-texpr, +.sig-inline.cpp-texpr { + font-family: unset; +} + +.sig.c .k, .sig.c .kt, +.sig.cpp .k, .sig.cpp .kt { + color: #0033B3; +} + +.sig.c .m, +.sig.cpp .m { + color: #1750EB; +} + +.sig.c .s, .sig.c .sc, +.sig.cpp .s, .sig.cpp .sc { + color: #067D17; +} + + +/* -- other body styles ----------------------------------------------------- */ + +ol.arabic { + list-style: decimal; +} + +ol.loweralpha { + list-style: lower-alpha; +} + +ol.upperalpha { + list-style: upper-alpha; +} + +ol.lowerroman { + list-style: lower-roman; +} + +ol.upperroman { + list-style: upper-roman; +} + +:not(li) > ol > 
li:first-child > :first-child, +:not(li) > ul > li:first-child > :first-child { + margin-top: 0px; +} + +:not(li) > ol > li:last-child > :last-child, +:not(li) > ul > li:last-child > :last-child { + margin-bottom: 0px; +} + +ol.simple ol p, +ol.simple ul p, +ul.simple ol p, +ul.simple ul p { + margin-top: 0; +} + +ol.simple > li:not(:first-child) > p, +ul.simple > li:not(:first-child) > p { + margin-top: 0; +} + +ol.simple p, +ul.simple p { + margin-bottom: 0; +} + +aside.footnote > span, +div.citation > span { + float: left; +} +aside.footnote > span:last-of-type, +div.citation > span:last-of-type { + padding-right: 0.5em; +} +aside.footnote > p { + margin-left: 2em; +} +div.citation > p { + margin-left: 4em; +} +aside.footnote > p:last-of-type, +div.citation > p:last-of-type { + margin-bottom: 0em; +} +aside.footnote > p:last-of-type:after, +div.citation > p:last-of-type:after { + content: ""; + clear: both; +} + +dl.field-list { + display: grid; + grid-template-columns: fit-content(30%) auto; +} + +dl.field-list > dt { + font-weight: bold; + word-break: break-word; + padding-left: 0.5em; + padding-right: 5px; +} + +dl.field-list > dd { + padding-left: 0.5em; + margin-top: 0em; + margin-left: 0em; + margin-bottom: 0em; +} + +dl { + margin-bottom: 15px; +} + +dd > :first-child { + margin-top: 0px; +} + +dd ul, dd table { + margin-bottom: 10px; +} + +dd { + margin-top: 3px; + margin-bottom: 10px; + margin-left: 30px; +} + +.sig dd { + margin-top: 0px; + margin-bottom: 0px; +} + +.sig dl { + margin-top: 0px; + margin-bottom: 0px; +} + +dl > dd:last-child, +dl > dd:last-child > :last-child { + margin-bottom: 0; +} + +dt:target, span.highlighted { + background-color: #fbe54e; +} + +rect.highlighted { + fill: #fbe54e; +} + +dl.glossary dt { + font-weight: bold; + font-size: 1.1em; +} + +.versionmodified { + font-style: italic; +} + +.system-message { + background-color: #fda; + padding: 5px; + border: 3px solid red; +} + +.footnote:target { + background-color: #ffa; +} 
+ +.line-block { + display: block; + margin-top: 1em; + margin-bottom: 1em; +} + +.line-block .line-block { + margin-top: 0; + margin-bottom: 0; + margin-left: 1.5em; +} + +.guilabel, .menuselection { + font-family: sans-serif; +} + +.accelerator { + text-decoration: underline; +} + +.classifier { + font-style: oblique; +} + +.classifier:before { + font-style: normal; + margin: 0 0.5em; + content: ":"; + display: inline-block; +} + +abbr, acronym { + border-bottom: dotted 1px; + cursor: help; +} + +.translated { + background-color: rgba(207, 255, 207, 0.2) +} + +.untranslated { + background-color: rgba(255, 207, 207, 0.2) +} + +/* -- code displays --------------------------------------------------------- */ + +pre { + overflow: auto; + overflow-y: hidden; /* fixes display issues on Chrome browsers */ +} + +pre, div[class*="highlight-"] { + clear: both; +} + +span.pre { + -moz-hyphens: none; + -ms-hyphens: none; + -webkit-hyphens: none; + hyphens: none; + white-space: nowrap; +} + +div[class*="highlight-"] { + margin: 1em 0; +} + +td.linenos pre { + border: 0; + background-color: transparent; + color: #aaa; +} + +table.highlighttable { + display: block; +} + +table.highlighttable tbody { + display: block; +} + +table.highlighttable tr { + display: flex; +} + +table.highlighttable td { + margin: 0; + padding: 0; +} + +table.highlighttable td.linenos { + padding-right: 0.5em; +} + +table.highlighttable td.code { + flex: 1; + overflow: hidden; +} + +.highlight .hll { + display: block; +} + +div.highlight pre, +table.highlighttable pre { + margin: 0; +} + +div.code-block-caption + div { + margin-top: 0; +} + +div.code-block-caption { + margin-top: 1em; + padding: 2px 5px; + font-size: small; +} + +div.code-block-caption code { + background-color: transparent; +} + +table.highlighttable td.linenos, +span.linenos, +div.highlight span.gp { /* gp: Generic.Prompt */ + user-select: none; + -webkit-user-select: text; /* Safari fallback only */ + -webkit-user-select: none; /* 
Chrome/Safari */ + -moz-user-select: none; /* Firefox */ + -ms-user-select: none; /* IE10+ */ +} + +div.code-block-caption span.caption-number { + padding: 0.1em 0.3em; + font-style: italic; +} + +div.code-block-caption span.caption-text { +} + +div.literal-block-wrapper { + margin: 1em 0; +} + +code.xref, a code { + background-color: transparent; + font-weight: bold; +} + +h1 code, h2 code, h3 code, h4 code, h5 code, h6 code { + background-color: transparent; +} + +.viewcode-link { + float: right; +} + +.viewcode-back { + float: right; + font-family: sans-serif; +} + +div.viewcode-block:target { + margin: -1px -10px; + padding: 0 10px; +} + +/* -- math display ---------------------------------------------------------- */ + +img.math { + vertical-align: middle; +} + +div.body div.math p { + text-align: center; +} + +span.eqno { + float: right; +} + +span.eqno a.headerlink { + position: absolute; + z-index: 1; +} + +div.math:hover a.headerlink { + visibility: visible; +} + +/* -- printout stylesheet --------------------------------------------------- */ + +@media print { + div.document, + div.documentwrapper, + div.bodywrapper { + margin: 0 !important; + width: 100%; + } + + div.sphinxsidebar, + div.related, + div.footer, + #top-link { + display: none; + } +} \ No newline at end of file diff --git a/_static/classic.css b/_static/classic.css new file mode 100644 index 000000000..55301478f --- /dev/null +++ b/_static/classic.css @@ -0,0 +1,269 @@ +/* + * classic.css_t + * ~~~~~~~~~~~~~ + * + * Sphinx stylesheet -- classic theme. + * + * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. + * :license: BSD, see LICENSE for details. 
+ * + */ + +@import url("basic.css"); + +/* -- page layout ----------------------------------------------------------- */ + +html { + /* CSS hack for macOS's scrollbar (see #1125) */ + background-color: #FFFFFF; +} + +body { + font-family: sans-serif; + font-size: 100%; + background-color: #11303d; + color: #000; + margin: 0; + padding: 0; +} + +div.document { + display: flex; + background-color: #1c4e63; +} + +div.documentwrapper { + float: left; + width: 100%; +} + +div.bodywrapper { + margin: 0 0 0 230px; +} + +div.body { + background-color: #ffffff; + color: #000000; + padding: 0 20px 30px 20px; +} + +div.footer { + color: #ffffff; + width: 100%; + padding: 9px 0 9px 0; + text-align: center; + font-size: 75%; +} + +div.footer a { + color: #ffffff; + text-decoration: underline; +} + +div.related { + background-color: #133f52; + line-height: 30px; + color: #ffffff; +} + +div.related a { + color: #ffffff; +} + +div.sphinxsidebar { +} + +div.sphinxsidebar h3 { + font-family: 'Trebuchet MS', sans-serif; + color: #ffffff; + font-size: 1.4em; + font-weight: normal; + margin: 0; + padding: 0; +} + +div.sphinxsidebar h3 a { + color: #ffffff; +} + +div.sphinxsidebar h4 { + font-family: 'Trebuchet MS', sans-serif; + color: #ffffff; + font-size: 1.3em; + font-weight: normal; + margin: 5px 0 0 0; + padding: 0; +} + +div.sphinxsidebar p { + color: #ffffff; +} + +div.sphinxsidebar p.topless { + margin: 5px 10px 10px 10px; +} + +div.sphinxsidebar ul { + margin: 10px; + padding: 0; + color: #ffffff; +} + +div.sphinxsidebar a { + color: #98dbcc; +} + +div.sphinxsidebar input { + border: 1px solid #98dbcc; + font-family: sans-serif; + font-size: 1em; +} + + + +/* -- hyperlink styles ------------------------------------------------------ */ + +a { + color: #355f7c; + text-decoration: none; +} + +a:visited { + color: #551a8b; + text-decoration: none; +} + +a:hover { + text-decoration: underline; +} + + + +/* -- body styles ----------------------------------------------------------- 
*/ + +div.body h1, +div.body h2, +div.body h3, +div.body h4, +div.body h5, +div.body h6 { + font-family: 'Trebuchet MS', sans-serif; + background-color: #f2f2f2; + font-weight: normal; + color: #20435c; + border-bottom: 1px solid #ccc; + margin: 20px -20px 10px -20px; + padding: 3px 0 3px 10px; +} + +div.body h1 { margin-top: 0; font-size: 200%; } +div.body h2 { font-size: 160%; } +div.body h3 { font-size: 140%; } +div.body h4 { font-size: 120%; } +div.body h5 { font-size: 110%; } +div.body h6 { font-size: 100%; } + +a.headerlink { + color: #c60f0f; + font-size: 0.8em; + padding: 0 4px 0 4px; + text-decoration: none; +} + +a.headerlink:hover { + background-color: #c60f0f; + color: white; +} + +div.body p, div.body dd, div.body li, div.body blockquote { + text-align: justify; + line-height: 130%; +} + +div.admonition p.admonition-title + p { + display: inline; +} + +div.admonition p { + margin-bottom: 5px; +} + +div.admonition pre { + margin-bottom: 5px; +} + +div.admonition ul, div.admonition ol { + margin-bottom: 5px; +} + +div.note { + background-color: #eee; + border: 1px solid #ccc; +} + +div.seealso { + background-color: #ffc; + border: 1px solid #ff6; +} + +nav.contents, +aside.topic, +div.topic { + background-color: #eee; +} + +div.warning { + background-color: #ffe4e4; + border: 1px solid #f66; +} + +p.admonition-title { + display: inline; +} + +p.admonition-title:after { + content: ":"; +} + +pre { + padding: 5px; + background-color: unset; + color: unset; + line-height: 120%; + border: 1px solid #ac9; + border-left: none; + border-right: none; +} + +code { + background-color: #ecf0f3; + padding: 0 1px 0 1px; + font-size: 0.95em; +} + +th, dl.field-list > dt { + background-color: #ede; +} + +.warning code { + background: #efc2c2; +} + +.note code { + background: #d6d6d6; +} + +.viewcode-back { + font-family: sans-serif; +} + +div.viewcode-block:target { + background-color: #f4debf; + border-top: 1px solid #ac9; + border-bottom: 1px solid #ac9; +} + 
+div.code-block-caption { + color: #efefef; + background-color: #1c4e63; +} \ No newline at end of file diff --git a/_static/default.css b/_static/default.css new file mode 100644 index 000000000..81b936363 --- /dev/null +++ b/_static/default.css @@ -0,0 +1 @@ +@import url("classic.css"); diff --git a/_static/doctools.js b/_static/doctools.js new file mode 100644 index 000000000..4d67807d1 --- /dev/null +++ b/_static/doctools.js @@ -0,0 +1,156 @@ +/* + * doctools.js + * ~~~~~~~~~~~ + * + * Base JavaScript utilities for all Sphinx HTML documentation. + * + * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. + * :license: BSD, see LICENSE for details. + * + */ +"use strict"; + +const BLACKLISTED_KEY_CONTROL_ELEMENTS = new Set([ + "TEXTAREA", + "INPUT", + "SELECT", + "BUTTON", +]); + +const _ready = (callback) => { + if (document.readyState !== "loading") { + callback(); + } else { + document.addEventListener("DOMContentLoaded", callback); + } +}; + +/** + * Small JavaScript module for the documentation. + */ +const Documentation = { + init: () => { + Documentation.initDomainIndexTable(); + Documentation.initOnKeyListeners(); + }, + + /** + * i18n support + */ + TRANSLATIONS: {}, + PLURAL_EXPR: (n) => (n === 1 ? 0 : 1), + LOCALE: "unknown", + + // gettext and ngettext don't access this so that the functions + // can safely bound to a different name (_ = Documentation.gettext) + gettext: (string) => { + const translated = Documentation.TRANSLATIONS[string]; + switch (typeof translated) { + case "undefined": + return string; // no translation + case "string": + return translated; // translation exists + default: + return translated[0]; // (singular, plural) translation tuple exists + } + }, + + ngettext: (singular, plural, n) => { + const translated = Documentation.TRANSLATIONS[singular]; + if (typeof translated !== "undefined") + return translated[Documentation.PLURAL_EXPR(n)]; + return n === 1 ? 
singular : plural; + }, + + addTranslations: (catalog) => { + Object.assign(Documentation.TRANSLATIONS, catalog.messages); + Documentation.PLURAL_EXPR = new Function( + "n", + `return (${catalog.plural_expr})` + ); + Documentation.LOCALE = catalog.locale; + }, + + /** + * helper function to focus on search bar + */ + focusSearchBar: () => { + document.querySelectorAll("input[name=q]")[0]?.focus(); + }, + + /** + * Initialise the domain index toggle buttons + */ + initDomainIndexTable: () => { + const toggler = (el) => { + const idNumber = el.id.substr(7); + const toggledRows = document.querySelectorAll(`tr.cg-${idNumber}`); + if (el.src.substr(-9) === "minus.png") { + el.src = `${el.src.substr(0, el.src.length - 9)}plus.png`; + toggledRows.forEach((el) => (el.style.display = "none")); + } else { + el.src = `${el.src.substr(0, el.src.length - 8)}minus.png`; + toggledRows.forEach((el) => (el.style.display = "")); + } + }; + + const togglerElements = document.querySelectorAll("img.toggler"); + togglerElements.forEach((el) => + el.addEventListener("click", (event) => toggler(event.currentTarget)) + ); + togglerElements.forEach((el) => (el.style.display = "")); + if (DOCUMENTATION_OPTIONS.COLLAPSE_INDEX) togglerElements.forEach(toggler); + }, + + initOnKeyListeners: () => { + // only install a listener if it is really needed + if ( + !DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS && + !DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS + ) + return; + + document.addEventListener("keydown", (event) => { + // bail for input elements + if (BLACKLISTED_KEY_CONTROL_ELEMENTS.has(document.activeElement.tagName)) return; + // bail with special keys + if (event.altKey || event.ctrlKey || event.metaKey) return; + + if (!event.shiftKey) { + switch (event.key) { + case "ArrowLeft": + if (!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) break; + + const prevLink = document.querySelector('link[rel="prev"]'); + if (prevLink && prevLink.href) { + window.location.href = prevLink.href; + 
event.preventDefault(); + } + break; + case "ArrowRight": + if (!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) break; + + const nextLink = document.querySelector('link[rel="next"]'); + if (nextLink && nextLink.href) { + window.location.href = nextLink.href; + event.preventDefault(); + } + break; + } + } + + // some keyboard layouts may need Shift to get / + switch (event.key) { + case "/": + if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) break; + Documentation.focusSearchBar(); + event.preventDefault(); + } + }); + }, +}; + +// quick alias for translations +const _ = Documentation.gettext; + +_ready(Documentation.init); diff --git a/_static/documentation_options.js b/_static/documentation_options.js new file mode 100644 index 000000000..91676b19b --- /dev/null +++ b/_static/documentation_options.js @@ -0,0 +1,13 @@ +const DOCUMENTATION_OPTIONS = { + VERSION: '2020.0', + LANGUAGE: 'en', + COLLAPSE_INDEX: false, + BUILDER: 'html', + FILE_SUFFIX: '.html', + LINK_SUFFIX: '.html', + HAS_SOURCE: true, + SOURCELINK_SUFFIX: '.txt', + NAVIGATION_WITH_KEYS: false, + SHOW_SEARCH_SUMMARY: true, + ENABLE_SEARCH_SHORTCUTS: true, +}; \ No newline at end of file diff --git a/_static/file.png b/_static/file.png new file mode 100644 index 0000000000000000000000000000000000000000..a858a410e4faa62ce324d814e4b816fff83a6fb3 GIT binary patch literal 286 zcmV+(0pb3MP)s`hMrGg#P~ix$^RISR_I47Y|r1 z_CyJOe}D1){SET-^Amu_i71Lt6eYfZjRyw@I6OQAIXXHDfiX^GbOlHe=Ae4>0m)d(f|Me07*qoM6N<$f}vM^LjV8( literal 0 HcmV?d00001 diff --git a/_static/language_data.js b/_static/language_data.js new file mode 100644 index 000000000..367b8ed81 --- /dev/null +++ b/_static/language_data.js @@ -0,0 +1,199 @@ +/* + * language_data.js + * ~~~~~~~~~~~~~~~~ + * + * This script contains the language-specific data used by searchtools.js, + * namely the list of stopwords, stemmer, scorer and splitter. + * + * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. + * :license: BSD, see LICENSE for details. 
+ * + */ + +var stopwords = ["a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "near", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"]; + + +/* Non-minified version is copied as a separate JS file, if available */ + +/** + * Porter Stemmer + */ +var Stemmer = function() { + + var step2list = { + ational: 'ate', + tional: 'tion', + enci: 'ence', + anci: 'ance', + izer: 'ize', + bli: 'ble', + alli: 'al', + entli: 'ent', + eli: 'e', + ousli: 'ous', + ization: 'ize', + ation: 'ate', + ator: 'ate', + alism: 'al', + iveness: 'ive', + fulness: 'ful', + ousness: 'ous', + aliti: 'al', + iviti: 'ive', + biliti: 'ble', + logi: 'log' + }; + + var step3list = { + icate: 'ic', + ative: '', + alize: 'al', + iciti: 'ic', + ical: 'ic', + ful: '', + ness: '' + }; + + var c = "[^aeiou]"; // consonant + var v = "[aeiouy]"; // vowel + var C = c + "[^aeiouy]*"; // consonant sequence + var V = v + "[aeiou]*"; // vowel sequence + + var mgr0 = "^(" + C + ")?" + V + C; // [C]VC... is m>0 + var meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$"; // [C]VC[V] is m=1 + var mgr1 = "^(" + C + ")?" + V + C + V + C; // [C]VCVC... is m>1 + var s_v = "^(" + C + ")?" 
+ v; // vowel in stem + + this.stemWord = function (w) { + var stem; + var suffix; + var firstch; + var origword = w; + + if (w.length < 3) + return w; + + var re; + var re2; + var re3; + var re4; + + firstch = w.substr(0,1); + if (firstch == "y") + w = firstch.toUpperCase() + w.substr(1); + + // Step 1a + re = /^(.+?)(ss|i)es$/; + re2 = /^(.+?)([^s])s$/; + + if (re.test(w)) + w = w.replace(re,"$1$2"); + else if (re2.test(w)) + w = w.replace(re2,"$1$2"); + + // Step 1b + re = /^(.+?)eed$/; + re2 = /^(.+?)(ed|ing)$/; + if (re.test(w)) { + var fp = re.exec(w); + re = new RegExp(mgr0); + if (re.test(fp[1])) { + re = /.$/; + w = w.replace(re,""); + } + } + else if (re2.test(w)) { + var fp = re2.exec(w); + stem = fp[1]; + re2 = new RegExp(s_v); + if (re2.test(stem)) { + w = stem; + re2 = /(at|bl|iz)$/; + re3 = new RegExp("([^aeiouylsz])\\1$"); + re4 = new RegExp("^" + C + v + "[^aeiouwxy]$"); + if (re2.test(w)) + w = w + "e"; + else if (re3.test(w)) { + re = /.$/; + w = w.replace(re,""); + } + else if (re4.test(w)) + w = w + "e"; + } + } + + // Step 1c + re = /^(.+?)y$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + re = new RegExp(s_v); + if (re.test(stem)) + w = stem + "i"; + } + + // Step 2 + re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + suffix = fp[2]; + re = new RegExp(mgr0); + if (re.test(stem)) + w = stem + step2list[suffix]; + } + + // Step 3 + re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + suffix = fp[2]; + re = new RegExp(mgr0); + if (re.test(stem)) + w = stem + step3list[suffix]; + } + + // Step 4 + re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/; + re2 = /^(.+?)(s|t)(ion)$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + re = new RegExp(mgr1); + if 
(re.test(stem)) + w = stem; + } + else if (re2.test(w)) { + var fp = re2.exec(w); + stem = fp[1] + fp[2]; + re2 = new RegExp(mgr1); + if (re2.test(stem)) + w = stem; + } + + // Step 5 + re = /^(.+?)e$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + re = new RegExp(mgr1); + re2 = new RegExp(meq1); + re3 = new RegExp("^" + C + v + "[^aeiouwxy]$"); + if (re.test(stem) || (re2.test(stem) && !(re3.test(stem)))) + w = stem; + } + re = /ll$/; + re2 = new RegExp(mgr1); + if (re.test(w) && re2.test(w)) { + re = /.$/; + w = w.replace(re,""); + } + + // and turn initial Y back to y + if (firstch == "y") + w = firstch.toLowerCase() + w.substr(1); + return w; + } +} + diff --git a/_static/minus.png b/_static/minus.png new file mode 100644 index 0000000000000000000000000000000000000000..d96755fdaf8bb2214971e0db9c1fd3077d7c419d GIT binary patch literal 90 zcmeAS@N?(olHy`uVBq!ia0vp^+#t*WBp7;*Yy1LIik>cxAr*|t7R?Mi>2?kWtu=nj kDsEF_5m^0CR;1wuP-*O&G^0G}KYk!hp00i_>zopr08q^qX#fBK literal 0 HcmV?d00001 diff --git a/_static/plus.png b/_static/plus.png new file mode 100644 index 0000000000000000000000000000000000000000..7107cec93a979b9a5f64843235a16651d563ce2d GIT binary patch literal 90 zcmeAS@N?(olHy`uVBq!ia0vp^+#t*WBp7;*Yy1LIik>cxAr*|t7R?Mi>2?kWtu>-2 m3q%Vub%g%s<8sJhVPMczOq}xhg9DJoz~JfX=d#Wzp$Pyb1r*Kz literal 0 HcmV?d00001 diff --git a/_static/pygments.css b/_static/pygments.css new file mode 100644 index 000000000..0d49244ed --- /dev/null +++ b/_static/pygments.css @@ -0,0 +1,75 @@ +pre { line-height: 125%; } +td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +.highlight .hll { background-color: 
#ffffcc } +.highlight { background: #eeffcc; } +.highlight .c { color: #408090; font-style: italic } /* Comment */ +.highlight .err { border: 1px solid #FF0000 } /* Error */ +.highlight .k { color: #007020; font-weight: bold } /* Keyword */ +.highlight .o { color: #666666 } /* Operator */ +.highlight .ch { color: #408090; font-style: italic } /* Comment.Hashbang */ +.highlight .cm { color: #408090; font-style: italic } /* Comment.Multiline */ +.highlight .cp { color: #007020 } /* Comment.Preproc */ +.highlight .cpf { color: #408090; font-style: italic } /* Comment.PreprocFile */ +.highlight .c1 { color: #408090; font-style: italic } /* Comment.Single */ +.highlight .cs { color: #408090; background-color: #fff0f0 } /* Comment.Special */ +.highlight .gd { color: #A00000 } /* Generic.Deleted */ +.highlight .ge { font-style: italic } /* Generic.Emph */ +.highlight .ges { font-weight: bold; font-style: italic } /* Generic.EmphStrong */ +.highlight .gr { color: #FF0000 } /* Generic.Error */ +.highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */ +.highlight .gi { color: #00A000 } /* Generic.Inserted */ +.highlight .go { color: #333333 } /* Generic.Output */ +.highlight .gp { color: #c65d09; font-weight: bold } /* Generic.Prompt */ +.highlight .gs { font-weight: bold } /* Generic.Strong */ +.highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ +.highlight .gt { color: #0044DD } /* Generic.Traceback */ +.highlight .kc { color: #007020; font-weight: bold } /* Keyword.Constant */ +.highlight .kd { color: #007020; font-weight: bold } /* Keyword.Declaration */ +.highlight .kn { color: #007020; font-weight: bold } /* Keyword.Namespace */ +.highlight .kp { color: #007020 } /* Keyword.Pseudo */ +.highlight .kr { color: #007020; font-weight: bold } /* Keyword.Reserved */ +.highlight .kt { color: #902000 } /* Keyword.Type */ +.highlight .m { color: #208050 } /* Literal.Number */ +.highlight .s { color: #4070a0 } /* Literal.String */ 
+.highlight .na { color: #4070a0 } /* Name.Attribute */ +.highlight .nb { color: #007020 } /* Name.Builtin */ +.highlight .nc { color: #0e84b5; font-weight: bold } /* Name.Class */ +.highlight .no { color: #60add5 } /* Name.Constant */ +.highlight .nd { color: #555555; font-weight: bold } /* Name.Decorator */ +.highlight .ni { color: #d55537; font-weight: bold } /* Name.Entity */ +.highlight .ne { color: #007020 } /* Name.Exception */ +.highlight .nf { color: #06287e } /* Name.Function */ +.highlight .nl { color: #002070; font-weight: bold } /* Name.Label */ +.highlight .nn { color: #0e84b5; font-weight: bold } /* Name.Namespace */ +.highlight .nt { color: #062873; font-weight: bold } /* Name.Tag */ +.highlight .nv { color: #bb60d5 } /* Name.Variable */ +.highlight .ow { color: #007020; font-weight: bold } /* Operator.Word */ +.highlight .w { color: #bbbbbb } /* Text.Whitespace */ +.highlight .mb { color: #208050 } /* Literal.Number.Bin */ +.highlight .mf { color: #208050 } /* Literal.Number.Float */ +.highlight .mh { color: #208050 } /* Literal.Number.Hex */ +.highlight .mi { color: #208050 } /* Literal.Number.Integer */ +.highlight .mo { color: #208050 } /* Literal.Number.Oct */ +.highlight .sa { color: #4070a0 } /* Literal.String.Affix */ +.highlight .sb { color: #4070a0 } /* Literal.String.Backtick */ +.highlight .sc { color: #4070a0 } /* Literal.String.Char */ +.highlight .dl { color: #4070a0 } /* Literal.String.Delimiter */ +.highlight .sd { color: #4070a0; font-style: italic } /* Literal.String.Doc */ +.highlight .s2 { color: #4070a0 } /* Literal.String.Double */ +.highlight .se { color: #4070a0; font-weight: bold } /* Literal.String.Escape */ +.highlight .sh { color: #4070a0 } /* Literal.String.Heredoc */ +.highlight .si { color: #70a0d0; font-style: italic } /* Literal.String.Interpol */ +.highlight .sx { color: #c65d09 } /* Literal.String.Other */ +.highlight .sr { color: #235388 } /* Literal.String.Regex */ +.highlight .s1 { color: #4070a0 } /* 
Literal.String.Single */ +.highlight .ss { color: #517918 } /* Literal.String.Symbol */ +.highlight .bp { color: #007020 } /* Name.Builtin.Pseudo */ +.highlight .fm { color: #06287e } /* Name.Function.Magic */ +.highlight .vc { color: #bb60d5 } /* Name.Variable.Class */ +.highlight .vg { color: #bb60d5 } /* Name.Variable.Global */ +.highlight .vi { color: #bb60d5 } /* Name.Variable.Instance */ +.highlight .vm { color: #bb60d5 } /* Name.Variable.Magic */ +.highlight .il { color: #208050 } /* Literal.Number.Integer.Long */ \ No newline at end of file diff --git a/_static/searchtools.js b/_static/searchtools.js new file mode 100644 index 000000000..92da3f8b2 --- /dev/null +++ b/_static/searchtools.js @@ -0,0 +1,619 @@ +/* + * searchtools.js + * ~~~~~~~~~~~~~~~~ + * + * Sphinx JavaScript utilities for the full-text search. + * + * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. + * :license: BSD, see LICENSE for details. + * + */ +"use strict"; + +/** + * Simple result scoring code. + */ +if (typeof Scorer === "undefined") { + var Scorer = { + // Implement the following function to further tweak the score for each result + // The function takes a result array [docname, title, anchor, descr, score, filename] + // and returns the new score. + /* + score: result => { + const [docname, title, anchor, descr, score, filename] = result + return score + }, + */ + + // query matches the full name of an object + objNameMatch: 11, + // or matches in the last dotted part of the object name + objPartialMatch: 6, + // Additive scores depending on the priority of the object + objPrio: { + 0: 15, // used to be importantResults + 1: 5, // used to be objectResults + 2: -5, // used to be unimportantResults + }, + // Used when the priority is not in the mapping. 
+ objPrioDefault: 0, + + // query found in title + title: 15, + partialTitle: 7, + // query found in terms + term: 5, + partialTerm: 2, + }; +} + +const _removeChildren = (element) => { + while (element && element.lastChild) element.removeChild(element.lastChild); +}; + +/** + * See https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions#escaping + */ +const _escapeRegExp = (string) => + string.replace(/[.*+\-?^${}()|[\]\\]/g, "\\$&"); // $& means the whole matched string + +const _displayItem = (item, searchTerms, highlightTerms) => { + const docBuilder = DOCUMENTATION_OPTIONS.BUILDER; + const docFileSuffix = DOCUMENTATION_OPTIONS.FILE_SUFFIX; + const docLinkSuffix = DOCUMENTATION_OPTIONS.LINK_SUFFIX; + const showSearchSummary = DOCUMENTATION_OPTIONS.SHOW_SEARCH_SUMMARY; + const contentRoot = document.documentElement.dataset.content_root; + + const [docName, title, anchor, descr, score, _filename] = item; + + let listItem = document.createElement("li"); + let requestUrl; + let linkUrl; + if (docBuilder === "dirhtml") { + // dirhtml builder + let dirname = docName + "/"; + if (dirname.match(/\/index\/$/)) + dirname = dirname.substring(0, dirname.length - 6); + else if (dirname === "index/") dirname = ""; + requestUrl = contentRoot + dirname; + linkUrl = requestUrl; + } else { + // normal html builders + requestUrl = contentRoot + docName + docFileSuffix; + linkUrl = docName + docLinkSuffix; + } + let linkEl = listItem.appendChild(document.createElement("a")); + linkEl.href = linkUrl + anchor; + linkEl.dataset.score = score; + linkEl.innerHTML = title; + if (descr) { + listItem.appendChild(document.createElement("span")).innerHTML = + " (" + descr + ")"; + // highlight search terms in the description + if (SPHINX_HIGHLIGHT_ENABLED) // set in sphinx_highlight.js + highlightTerms.forEach((term) => _highlightText(listItem, term, "highlighted")); + } + else if (showSearchSummary) + fetch(requestUrl) + .then((responseData) => 
responseData.text()) + .then((data) => { + if (data) + listItem.appendChild( + Search.makeSearchSummary(data, searchTerms, anchor) + ); + // highlight search terms in the summary + if (SPHINX_HIGHLIGHT_ENABLED) // set in sphinx_highlight.js + highlightTerms.forEach((term) => _highlightText(listItem, term, "highlighted")); + }); + Search.output.appendChild(listItem); +}; +const _finishSearch = (resultCount) => { + Search.stopPulse(); + Search.title.innerText = _("Search Results"); + if (!resultCount) + Search.status.innerText = Documentation.gettext( + "Your search did not match any documents. Please make sure that all words are spelled correctly and that you've selected enough categories." + ); + else + Search.status.innerText = _( + "Search finished, found ${resultCount} page(s) matching the search query." + ).replace('${resultCount}', resultCount); +}; +const _displayNextItem = ( + results, + resultCount, + searchTerms, + highlightTerms, +) => { + // results left, load the summary and display it + // this is intended to be dynamic (don't sub resultsCount) + if (results.length) { + _displayItem(results.pop(), searchTerms, highlightTerms); + setTimeout( + () => _displayNextItem(results, resultCount, searchTerms, highlightTerms), + 5 + ); + } + // search finished, update title and status message + else _finishSearch(resultCount); +}; +// Helper function used by query() to order search results. +// Each input is an array of [docname, title, anchor, descr, score, filename]. +// Order the results by score (in opposite order of appearance, since the +// `_displayNextItem` function uses pop() to retrieve items) and then alphabetically. +const _orderResultsByScoreThenName = (a, b) => { + const leftScore = a[4]; + const rightScore = b[4]; + if (leftScore === rightScore) { + // same score: sort alphabetically + const leftTitle = a[1].toLowerCase(); + const rightTitle = b[1].toLowerCase(); + if (leftTitle === rightTitle) return 0; + return leftTitle > rightTitle ? 
-1 : 1; // inverted is intentional + } + return leftScore > rightScore ? 1 : -1; +}; + +/** + * Default splitQuery function. Can be overridden in ``sphinx.search`` with a + * custom function per language. + * + * The regular expression works by splitting the string on consecutive characters + * that are not Unicode letters, numbers, underscores, or emoji characters. + * This is the same as ``\W+`` in Python, preserving the surrogate pair area. + */ +if (typeof splitQuery === "undefined") { + var splitQuery = (query) => query + .split(/[^\p{Letter}\p{Number}_\p{Emoji_Presentation}]+/gu) + .filter(term => term) // remove remaining empty strings +} + +/** + * Search Module + */ +const Search = { + _index: null, + _queued_query: null, + _pulse_status: -1, + + htmlToText: (htmlString, anchor) => { + const htmlElement = new DOMParser().parseFromString(htmlString, 'text/html'); + for (const removalQuery of [".headerlinks", "script", "style"]) { + htmlElement.querySelectorAll(removalQuery).forEach((el) => { el.remove() }); + } + if (anchor) { + const anchorContent = htmlElement.querySelector(`[role="main"] ${anchor}`); + if (anchorContent) return anchorContent.textContent; + + console.warn( + `Anchored content block not found. Sphinx search tries to obtain it via DOM query '[role=main] ${anchor}'. Check your theme or template.` + ); + } + + // if anchor not specified or not found, fall back to main content + const docContent = htmlElement.querySelector('[role="main"]'); + if (docContent) return docContent.textContent; + + console.warn( + "Content block not found. Sphinx search tries to obtain it via DOM query '[role=main]'. Check your theme or template." 
+ ); + return ""; + }, + + init: () => { + const query = new URLSearchParams(window.location.search).get("q"); + document + .querySelectorAll('input[name="q"]') + .forEach((el) => (el.value = query)); + if (query) Search.performSearch(query); + }, + + loadIndex: (url) => + (document.body.appendChild(document.createElement("script")).src = url), + + setIndex: (index) => { + Search._index = index; + if (Search._queued_query !== null) { + const query = Search._queued_query; + Search._queued_query = null; + Search.query(query); + } + }, + + hasIndex: () => Search._index !== null, + + deferQuery: (query) => (Search._queued_query = query), + + stopPulse: () => (Search._pulse_status = -1), + + startPulse: () => { + if (Search._pulse_status >= 0) return; + + const pulse = () => { + Search._pulse_status = (Search._pulse_status + 1) % 4; + Search.dots.innerText = ".".repeat(Search._pulse_status); + if (Search._pulse_status >= 0) window.setTimeout(pulse, 500); + }; + pulse(); + }, + + /** + * perform a search for something (or wait until index is loaded) + */ + performSearch: (query) => { + // create the required interface elements + const searchText = document.createElement("h2"); + searchText.textContent = _("Searching"); + const searchSummary = document.createElement("p"); + searchSummary.classList.add("search-summary"); + searchSummary.innerText = ""; + const searchList = document.createElement("ul"); + searchList.classList.add("search"); + + const out = document.getElementById("search-results"); + Search.title = out.appendChild(searchText); + Search.dots = Search.title.appendChild(document.createElement("span")); + Search.status = out.appendChild(searchSummary); + Search.output = out.appendChild(searchList); + + const searchProgress = document.getElementById("search-progress"); + // Some themes don't use the search progress node + if (searchProgress) { + searchProgress.innerText = _("Preparing search..."); + } + Search.startPulse(); + + // index already loaded, the 
browser was quick! + if (Search.hasIndex()) Search.query(query); + else Search.deferQuery(query); + }, + + _parseQuery: (query) => { + // stem the search terms and add them to the correct list + const stemmer = new Stemmer(); + const searchTerms = new Set(); + const excludedTerms = new Set(); + const highlightTerms = new Set(); + const objectTerms = new Set(splitQuery(query.toLowerCase().trim())); + splitQuery(query.trim()).forEach((queryTerm) => { + const queryTermLower = queryTerm.toLowerCase(); + + // maybe skip this "word" + // stopwords array is from language_data.js + if ( + stopwords.indexOf(queryTermLower) !== -1 || + queryTerm.match(/^\d+$/) + ) + return; + + // stem the word + let word = stemmer.stemWord(queryTermLower); + // select the correct list + if (word[0] === "-") excludedTerms.add(word.substr(1)); + else { + searchTerms.add(word); + highlightTerms.add(queryTermLower); + } + }); + + if (SPHINX_HIGHLIGHT_ENABLED) { // set in sphinx_highlight.js + localStorage.setItem("sphinx_highlight_terms", [...highlightTerms].join(" ")) + } + + // console.debug("SEARCH: searching for:"); + // console.info("required: ", [...searchTerms]); + // console.info("excluded: ", [...excludedTerms]); + + return [query, searchTerms, excludedTerms, highlightTerms, objectTerms]; + }, + + /** + * execute search (requires search index to be loaded) + */ + _performSearch: (query, searchTerms, excludedTerms, highlightTerms, objectTerms) => { + const filenames = Search._index.filenames; + const docNames = Search._index.docnames; + const titles = Search._index.titles; + const allTitles = Search._index.alltitles; + const indexEntries = Search._index.indexentries; + + // Collect multiple result groups to be sorted separately and then ordered. + // Each is an array of [docname, title, anchor, descr, score, filename]. 
+ const normalResults = []; + const nonMainIndexResults = []; + + _removeChildren(document.getElementById("search-progress")); + + const queryLower = query.toLowerCase().trim(); + for (const [title, foundTitles] of Object.entries(allTitles)) { + if (title.toLowerCase().trim().includes(queryLower) && (queryLower.length >= title.length/2)) { + for (const [file, id] of foundTitles) { + let score = Math.round(100 * queryLower.length / title.length) + normalResults.push([ + docNames[file], + titles[file] !== title ? `${titles[file]} > ${title}` : title, + id !== null ? "#" + id : "", + null, + score, + filenames[file], + ]); + } + } + } + + // search for explicit entries in index directives + for (const [entry, foundEntries] of Object.entries(indexEntries)) { + if (entry.includes(queryLower) && (queryLower.length >= entry.length/2)) { + for (const [file, id, isMain] of foundEntries) { + const score = Math.round(100 * queryLower.length / entry.length); + const result = [ + docNames[file], + titles[file], + id ? "#" + id : "", + null, + score, + filenames[file], + ]; + if (isMain) { + normalResults.push(result); + } else { + nonMainIndexResults.push(result); + } + } + } + } + + // lookup as object + objectTerms.forEach((term) => + normalResults.push(...Search.performObjectSearch(term, objectTerms)) + ); + + // lookup as search terms in fulltext + normalResults.push(...Search.performTermsSearch(searchTerms, excludedTerms)); + + // let the scorer override scores with a custom scoring function + if (Scorer.score) { + normalResults.forEach((item) => (item[4] = Scorer.score(item))); + nonMainIndexResults.forEach((item) => (item[4] = Scorer.score(item))); + } + + // Sort each group of results by score and then alphabetically by name. + normalResults.sort(_orderResultsByScoreThenName); + nonMainIndexResults.sort(_orderResultsByScoreThenName); + + // Combine the result groups in (reverse) order. 
+ // Non-main index entries are typically arbitrary cross-references, + // so display them after other results. + let results = [...nonMainIndexResults, ...normalResults]; + + // remove duplicate search results + // note the reversing of results, so that in the case of duplicates, the highest-scoring entry is kept + let seen = new Set(); + results = results.reverse().reduce((acc, result) => { + let resultStr = result.slice(0, 4).concat([result[5]]).map(v => String(v)).join(','); + if (!seen.has(resultStr)) { + acc.push(result); + seen.add(resultStr); + } + return acc; + }, []); + + return results.reverse(); + }, + + query: (query) => { + const [searchQuery, searchTerms, excludedTerms, highlightTerms, objectTerms] = Search._parseQuery(query); + const results = Search._performSearch(searchQuery, searchTerms, excludedTerms, highlightTerms, objectTerms); + + // for debugging + //Search.lastresults = results.slice(); // a copy + // console.info("search results:", Search.lastresults); + + // print the results + _displayNextItem(results, results.length, searchTerms, highlightTerms); + }, + + /** + * search for object names + */ + performObjectSearch: (object, objectTerms) => { + const filenames = Search._index.filenames; + const docNames = Search._index.docnames; + const objects = Search._index.objects; + const objNames = Search._index.objnames; + const titles = Search._index.titles; + + const results = []; + + const objectSearchCallback = (prefix, match) => { + const name = match[4] + const fullname = (prefix ? prefix + "." : "") + name; + const fullnameLower = fullname.toLowerCase(); + if (fullnameLower.indexOf(object) < 0) return; + + let score = 0; + const parts = fullnameLower.split("."); + + // check for different match types: exact matches of full name or + // "last name" (i.e. 
last dotted part) + if (fullnameLower === object || parts.slice(-1)[0] === object) + score += Scorer.objNameMatch; + else if (parts.slice(-1)[0].indexOf(object) > -1) + score += Scorer.objPartialMatch; // matches in last name + + const objName = objNames[match[1]][2]; + const title = titles[match[0]]; + + // If more than one term searched for, we require other words to be + // found in the name/title/description + const otherTerms = new Set(objectTerms); + otherTerms.delete(object); + if (otherTerms.size > 0) { + const haystack = `${prefix} ${name} ${objName} ${title}`.toLowerCase(); + if ( + [...otherTerms].some((otherTerm) => haystack.indexOf(otherTerm) < 0) + ) + return; + } + + let anchor = match[3]; + if (anchor === "") anchor = fullname; + else if (anchor === "-") anchor = objNames[match[1]][1] + "-" + fullname; + + const descr = objName + _(", in ") + title; + + // add custom score for some objects according to scorer + if (Scorer.objPrio.hasOwnProperty(match[2])) + score += Scorer.objPrio[match[2]]; + else score += Scorer.objPrioDefault; + + results.push([ + docNames[match[0]], + fullname, + "#" + anchor, + descr, + score, + filenames[match[0]], + ]); + }; + Object.keys(objects).forEach((prefix) => + objects[prefix].forEach((array) => + objectSearchCallback(prefix, array) + ) + ); + return results; + }, + + /** + * search for full-text terms in the index + */ + performTermsSearch: (searchTerms, excludedTerms) => { + // prepare search + const terms = Search._index.terms; + const titleTerms = Search._index.titleterms; + const filenames = Search._index.filenames; + const docNames = Search._index.docnames; + const titles = Search._index.titles; + + const scoreMap = new Map(); + const fileMap = new Map(); + + // perform the search on the required terms + searchTerms.forEach((word) => { + const files = []; + const arr = [ + { files: terms[word], score: Scorer.term }, + { files: titleTerms[word], score: Scorer.title }, + ]; + // add support for partial matches + 
if (word.length > 2) { + const escapedWord = _escapeRegExp(word); + if (!terms.hasOwnProperty(word)) { + Object.keys(terms).forEach((term) => { + if (term.match(escapedWord)) + arr.push({ files: terms[term], score: Scorer.partialTerm }); + }); + } + if (!titleTerms.hasOwnProperty(word)) { + Object.keys(titleTerms).forEach((term) => { + if (term.match(escapedWord)) + arr.push({ files: titleTerms[term], score: Scorer.partialTitle }); + }); + } + } + + // no match but word was a required one + if (arr.every((record) => record.files === undefined)) return; + + // found search word in contents + arr.forEach((record) => { + if (record.files === undefined) return; + + let recordFiles = record.files; + if (recordFiles.length === undefined) recordFiles = [recordFiles]; + files.push(...recordFiles); + + // set score for the word in each file + recordFiles.forEach((file) => { + if (!scoreMap.has(file)) scoreMap.set(file, {}); + scoreMap.get(file)[word] = record.score; + }); + }); + + // create the mapping + files.forEach((file) => { + if (!fileMap.has(file)) fileMap.set(file, [word]); + else if (fileMap.get(file).indexOf(word) === -1) fileMap.get(file).push(word); + }); + }); + + // now check if the files don't contain excluded terms + const results = []; + for (const [file, wordList] of fileMap) { + // check if all requirements are matched + + // as search terms with length < 3 are discarded + const filteredTermCount = [...searchTerms].filter( + (term) => term.length > 2 + ).length; + if ( + wordList.length !== searchTerms.size && + wordList.length !== filteredTermCount + ) + continue; + + // ensure that none of the excluded terms is in the search result + if ( + [...excludedTerms].some( + (term) => + terms[term] === file || + titleTerms[term] === file || + (terms[term] || []).includes(file) || + (titleTerms[term] || []).includes(file) + ) + ) + break; + + // select one (max) score for the file. 
+ const score = Math.max(...wordList.map((w) => scoreMap.get(file)[w])); + // add result to the result list + results.push([ + docNames[file], + titles[file], + "", + null, + score, + filenames[file], + ]); + } + return results; + }, + + /** + * helper function to return a node containing the + * search summary for a given text. keywords is a list + * of stemmed words. + */ + makeSearchSummary: (htmlText, keywords, anchor) => { + const text = Search.htmlToText(htmlText, anchor); + if (text === "") return null; + + const textLower = text.toLowerCase(); + const actualStartPosition = [...keywords] + .map((k) => textLower.indexOf(k.toLowerCase())) + .filter((i) => i > -1) + .slice(-1)[0]; + const startWithContext = Math.max(actualStartPosition - 120, 0); + + const top = startWithContext === 0 ? "" : "..."; + const tail = startWithContext + 240 < text.length ? "..." : ""; + + let summary = document.createElement("p"); + summary.classList.add("context"); + summary.textContent = top + text.substr(startWithContext, 240).trim() + tail; + + return summary; + }, +}; + +_ready(Search.init); diff --git a/_static/sidebar.js b/_static/sidebar.js new file mode 100644 index 000000000..f28c20689 --- /dev/null +++ b/_static/sidebar.js @@ -0,0 +1,70 @@ +/* + * sidebar.js + * ~~~~~~~~~~ + * + * This script makes the Sphinx sidebar collapsible. + * + * .sphinxsidebar contains .sphinxsidebarwrapper. This script adds + * in .sphixsidebar, after .sphinxsidebarwrapper, the #sidebarbutton + * used to collapse and expand the sidebar. + * + * When the sidebar is collapsed the .sphinxsidebarwrapper is hidden + * and the width of the sidebar and the margin-left of the document + * are decreased. When the sidebar is expanded the opposite happens. + * This script saves a per-browser/per-session cookie used to + * remember the position of the sidebar among the pages. + * Once the browser is closed the cookie is deleted and the position + * reset to the default (expanded). 
+ * + * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. + * :license: BSD, see LICENSE for details. + * + */ + +const initialiseSidebar = () => { + + + + + // global elements used by the functions. + const bodyWrapper = document.getElementsByClassName("bodywrapper")[0] + const sidebar = document.getElementsByClassName("sphinxsidebar")[0] + const sidebarWrapper = document.getElementsByClassName('sphinxsidebarwrapper')[0] + const sidebarButton = document.getElementById("sidebarbutton") + const sidebarArrow = sidebarButton.querySelector('span') + + // for some reason, the document has no sidebar; do not run into errors + if (typeof sidebar === "undefined") return; + + const flipArrow = element => element.innerText = (element.innerText === "»") ? "«" : "»" + + const collapse_sidebar = () => { + bodyWrapper.style.marginLeft = ".8em"; + sidebar.style.width = ".8em" + sidebarWrapper.style.display = "none" + flipArrow(sidebarArrow) + sidebarButton.title = _('Expand sidebar') + window.localStorage.setItem("sidebar", "collapsed") + } + + const expand_sidebar = () => { + bodyWrapper.style.marginLeft = "" + sidebar.style.removeProperty("width") + sidebarWrapper.style.display = "" + flipArrow(sidebarArrow) + sidebarButton.title = _('Collapse sidebar') + window.localStorage.setItem("sidebar", "expanded") + } + + sidebarButton.addEventListener("click", () => { + (sidebarWrapper.style.display === "none") ? 
expand_sidebar() : collapse_sidebar() + }) + + if (!window.localStorage.getItem("sidebar")) return + const value = window.localStorage.getItem("sidebar") + if (value === "collapsed") collapse_sidebar(); + else if (value === "expanded") expand_sidebar(); +} + +if (document.readyState !== "loading") initialiseSidebar() +else document.addEventListener("DOMContentLoaded", initialiseSidebar) \ No newline at end of file diff --git a/_static/sphinx_highlight.js b/_static/sphinx_highlight.js new file mode 100644 index 000000000..8a96c69a1 --- /dev/null +++ b/_static/sphinx_highlight.js @@ -0,0 +1,154 @@ +/* Highlighting utilities for Sphinx HTML documentation. */ +"use strict"; + +const SPHINX_HIGHLIGHT_ENABLED = true + +/** + * highlight a given string on a node by wrapping it in + * span elements with the given class name. + */ +const _highlight = (node, addItems, text, className) => { + if (node.nodeType === Node.TEXT_NODE) { + const val = node.nodeValue; + const parent = node.parentNode; + const pos = val.toLowerCase().indexOf(text); + if ( + pos >= 0 && + !parent.classList.contains(className) && + !parent.classList.contains("nohighlight") + ) { + let span; + + const closestNode = parent.closest("body, svg, foreignObject"); + const isInSVG = closestNode && closestNode.matches("svg"); + if (isInSVG) { + span = document.createElementNS("http://www.w3.org/2000/svg", "tspan"); + } else { + span = document.createElement("span"); + span.classList.add(className); + } + + span.appendChild(document.createTextNode(val.substr(pos, text.length))); + const rest = document.createTextNode(val.substr(pos + text.length)); + parent.insertBefore( + span, + parent.insertBefore( + rest, + node.nextSibling + ) + ); + node.nodeValue = val.substr(0, pos); + /* There may be more occurrences of search term in this node. So call this + * function recursively on the remaining fragment. 
+ */ + _highlight(rest, addItems, text, className); + + if (isInSVG) { + const rect = document.createElementNS( + "http://www.w3.org/2000/svg", + "rect" + ); + const bbox = parent.getBBox(); + rect.x.baseVal.value = bbox.x; + rect.y.baseVal.value = bbox.y; + rect.width.baseVal.value = bbox.width; + rect.height.baseVal.value = bbox.height; + rect.setAttribute("class", className); + addItems.push({ parent: parent, target: rect }); + } + } + } else if (node.matches && !node.matches("button, select, textarea")) { + node.childNodes.forEach((el) => _highlight(el, addItems, text, className)); + } +}; +const _highlightText = (thisNode, text, className) => { + let addItems = []; + _highlight(thisNode, addItems, text, className); + addItems.forEach((obj) => + obj.parent.insertAdjacentElement("beforebegin", obj.target) + ); +}; + +/** + * Small JavaScript module for the documentation. + */ +const SphinxHighlight = { + + /** + * highlight the search words provided in localstorage in the text + */ + highlightSearchWords: () => { + if (!SPHINX_HIGHLIGHT_ENABLED) return; // bail if no highlight + + // get and clear terms from localstorage + const url = new URL(window.location); + const highlight = + localStorage.getItem("sphinx_highlight_terms") + || url.searchParams.get("highlight") + || ""; + localStorage.removeItem("sphinx_highlight_terms") + url.searchParams.delete("highlight"); + window.history.replaceState({}, "", url); + + // get individual terms from highlight string + const terms = highlight.toLowerCase().split(/\s+/).filter(x => x); + if (terms.length === 0) return; // nothing to do + + // There should never be more than one element matching "div.body" + const divBody = document.querySelectorAll("div.body"); + const body = divBody.length ? 
divBody[0] : document.querySelector("body"); + window.setTimeout(() => { + terms.forEach((term) => _highlightText(body, term, "highlighted")); + }, 10); + + const searchBox = document.getElementById("searchbox"); + if (searchBox === null) return; + searchBox.appendChild( + document + .createRange() + .createContextualFragment( + '" + ) + ); + }, + + /** + * helper function to hide the search marks again + */ + hideSearchWords: () => { + document + .querySelectorAll("#searchbox .highlight-link") + .forEach((el) => el.remove()); + document + .querySelectorAll("span.highlighted") + .forEach((el) => el.classList.remove("highlighted")); + localStorage.removeItem("sphinx_highlight_terms") + }, + + initEscapeListener: () => { + // only install a listener if it is really needed + if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) return; + + document.addEventListener("keydown", (event) => { + // bail for input elements + if (BLACKLISTED_KEY_CONTROL_ELEMENTS.has(document.activeElement.tagName)) return; + // bail with special keys + if (event.shiftKey || event.altKey || event.ctrlKey || event.metaKey) return; + if (DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS && (event.key === "Escape")) { + SphinxHighlight.hideSearchWords(); + event.preventDefault(); + } + }); + }, +}; + +_ready(() => { + /* Do not call highlightSearchWords() when we are on the search page. + * It will highlight words from the *previous* search query. + */ + if (typeof Search === "undefined") SphinxHighlight.highlightSearchWords(); + SphinxHighlight.initEscapeListener(); +}); diff --git a/architecture.html b/architecture.html new file mode 100644 index 000000000..e31c943f5 --- /dev/null +++ b/architecture.html @@ -0,0 +1,184 @@ + + + + + + + + PyOP2 Architecture — PyOP2 2020.0 documentation + + + + + + + + + + + + + + +
+
+
+
+ +
+

PyOP2 Architecture

+

As described in PyOP2 Concepts, PyOP2 exposes an API that allows users to +declare the topology of unstructured meshes in the form of Sets and Maps and data in the form of +Dats, Mats, Globals and Consts. Computations on this data +are expressed as Kernels, described in PyOP2 Kernels, +and executed by parallel loops.

+

The API is the frontend to the PyOP2 runtime compilation architecture, which +supports the generation and just-in-time (JIT) compilation of low-level code +for a range of backends described in PyOP2 Backends and the efficient +scheduling of parallel computations. A schematic overview of the PyOP2 +architecture is given below:

+
+_images/pyop2_architecture.svg
+

Schematic overview of the PyOP2 architecture

+
+
+

From an outside perspective, PyOP2 is a conventional Python library, with +performance critical library functions implemented in Cython. A user’s +application code makes calls to the PyOP2 API, most of which are conventional +library calls. The exceptions are par_loop() calls, which +encapsulate PyOP2’s runtime core functionality performing backend-specific +code generation. Executing a parallel loop comprises the following steps:

+
    +
  1. Compute a parallel execution plan, including information for efficient +staging of data and partitioning and colouring of the iteration set for +conflict-free parallel execution. This process is described in Parallel Execution Plan +and does not apply to the sequential backend.

  2. +
  3. Generate backend-specific code for executing the computation for a given +set of par_loop() arguments as detailed in PyOP2 Backends +according to the execution plan computed in the previous step.

  4. +
  5. Pass the generated code to a backend-specific toolchain for just-in-time +compilation, producing a shared library callable as a Python module which +is dynamically loaded. This module is cached on disk to save recompilation +when the same par_loop() is called again for the same backend.

  6. +
  7. Build the backend-specific list of arguments to be passed to the generated +code, which may initiate host to device data transfer for the CUDA and +OpenCL backends.

  8. +
  9. Call into the generated module to perform the actual computation. For +distributed parallel computations this involves separate calls for the +regions owned by the current processor and the halo as described in +MPI.

  10. +
  11. Perform any necessary reductions for Globals.

  12. +
  13. Call the backend-specific matrix assembly procedure on any +Mat arguments.

  14. +
+
+

Multiple Backend Support

+

The backend is selected by passing the keyword argument backend to the +init() function. If omitted, the sequential backend is +selected by default. This choice can be overridden by exporting the +environment variable PYOP2_BACKEND, which allows switching backends +without having to touch the code. Once chosen, the backend cannot be changed +for the duration of the running Python interpreter session.

+

PyOP2 provides a single API to the user, regardless of which backend the +computations are running on. All classes and functions that form the public +API defined in pyop2.op2 are interfaces, whose concrete implementations +are initialised according to the chosen backend. A metaclass takes care of +instantiating a backend-specific version of the requested class and setting +the corresponding docstrings such that this process is entirely transparent to +the user. The implementation of the PyOP2 backends is completely orthogonal to +the backend selection process and free to use established practices of +object-oriented design.

+
+
+ + +
+
+
+
+ +
+
+ + + + \ No newline at end of file diff --git a/backends.html b/backends.html new file mode 100644 index 000000000..da944ffe9 --- /dev/null +++ b/backends.html @@ -0,0 +1,560 @@ + + + + + + + + PyOP2 Backends — PyOP2 2020.0 documentation + + + + + + + + + + + + + + +
+
+
+
+ +
+

PyOP2 Backends

+

PyOP2 provides a number of different backends to be able to run parallel +computations on different hardware architectures. The currently supported +backends are

+
    +
  • sequential: runs sequentially on a single CPU core.

  • +
  • openmp: runs multiple threads on an SMP CPU using OpenMP. The number of +threads is set with the environment variable OMP_NUM_THREADS.

  • +
  • cuda: offloads computation to an NVIDIA GPU (requires CUDA and pycuda)

  • +
  • opencl: offloads computation to an OpenCL device, either a multi-core +CPU or a GPU (requires OpenCL and pyopencl)

  • +
+

Distributed parallel computations using MPI are supported by PyOP2 and +described in detail in MPI. Data structures must be partitioned among +MPI processes with overlapping regions, so-called halos. The host backends +sequential and openmp have full MPI support, whereas the device backends +cuda and opencl only support parallel loops on Dats. Hybrid parallel computations with OpenMP are possible, where +OMP_NUM_THREADS threads are launched per MPI rank.

+
+

Host backends

+

Any computation in PyOP2 requires the generation of code at runtime specific +to each individual par_loop(). The host backends generate code +which is just-in-time (JIT) compiled into a shared library callable +via ctypes. The compilation procedure also takes care of +caching the compiled library on disk, such that the compilation cost +is not paid every time.

+
+

Sequential backend

+

Since there is no parallel computation for the sequential backend, the +generated code is a C wrapper function with a for loop calling the kernel +for the respective par_loop(). This wrapper also takes care of +staging in and out the data as specified by the access descriptors given +in the parallel loop. Both the kernel and the wrapper function are +just-in-time compiled in a single compilation unit such that the kernel call +can be inlined and does not incur any function call overhead.

+

Recall the par_loop() calling the midpoint kernel from +PyOP2 Kernels:

+
op2.par_loop(midpoint, cells,
+             midpoints(op2.WRITE),
+             coordinates(op2.READ, cell2vertex))
+
+
+

The JIT compiled code for this loop is the kernel followed by the generated +wrapper code:

+
 1inline void midpoint(double p[2], double *coords[2]) {
+ 2  p[0] = (coords[0][0] + coords[1][0] + coords[2][0]) / 3.0;
+ 3  p[1] = (coords[0][1] + coords[1][1] + coords[2][1]) / 3.0;
+ 4}
+ 5
+ 6void wrap_midpoint__(PyObject *_start, PyObject *_end,
+ 7                     PyObject *_arg0_0,
+ 8                     PyObject *_arg1_0, PyObject *_arg1_0_map0_0) {
+ 9  int start = (int)PyInt_AsLong(_start);
+10  int end = (int)PyInt_AsLong(_end);
+11  double *arg0_0 = (double *)(((PyArrayObject *)_arg0_0)->data);
+12  double *arg1_0 = (double *)(((PyArrayObject *)_arg1_0)->data);
+13  int *arg1_0_map0_0 = (int *)(((PyArrayObject *)_arg1_0_map0_0)->data);
+14  double *arg1_0_vec[3];
+15  for ( int n = start; n < end; n++ ) {
+16    int i = n;
+17    arg1_0_vec[0] = arg1_0 + arg1_0_map0_0[i * 3 + 0] * 2;
+18    arg1_0_vec[1] = arg1_0 + arg1_0_map0_0[i * 3 + 1] * 2;
+19    arg1_0_vec[2] = arg1_0 + arg1_0_map0_0[i * 3 + 2] * 2;
+20    midpoint(arg0_0 + i * 2, arg1_0_vec);
+21  }
+22}
+
+
+

Note that the wrapper function is called directly from Python and therefore +all arguments are plain Python objects, which first need to be unwrapped. The +arguments _start and _end define the iteration set indices to iterate +over. The remaining arguments are arrays +corresponding to a Dat or Map passed to the +par_loop(). Arguments are consecutively numbered to avoid name +clashes.

+

The first par_loop() argument midpoints is direct and +therefore no corresponding Map is passed to the wrapper +function and the data pointer is passed straight to the kernel with an +appropriate offset. The second argument coordinates is indirect and hence +a Dat-Map pair is passed. Pointers to the data +are gathered via the Map of arity 3 and staged in the array +arg1_0_vec, which is passed to the kernel. The coordinate data can +therefore be accessed in the kernel via double indirection with the +Map already applied. Note that for both arguments, the +pointers are to two consecutive double values, since the +DataSet is of dimension two in either case.

+
+
+

OpenMP backend

+

In contrast to the sequential backend, the outermost for loop in the +OpenMP backend is annotated with OpenMP pragmas to execute in parallel with +multiple threads. To avoid race conditions on data access, the iteration set +is coloured and a thread safe execution plan is computed as described in +Colouring.

+

The JIT compiled code for the parallel loop from above changes as follows:

+
 1void wrap_midpoint__(PyObject* _boffset,
+ 2                     PyObject* _nblocks,
+ 3                     PyObject* _blkmap,
+ 4                     PyObject* _offset,
+ 5                     PyObject* _nelems,
+ 6                     PyObject *_arg0_0,
+ 7                     PyObject *_arg1_0, PyObject *_arg1_0_map0_0) {
+ 8  int boffset = (int)PyInt_AsLong(_boffset);
+ 9  int nblocks = (int)PyInt_AsLong(_nblocks);
+10  int* blkmap = (int *)(((PyArrayObject *)_blkmap)->data);
+11  int* offset = (int *)(((PyArrayObject *)_offset)->data);
+12  int* nelems = (int *)(((PyArrayObject *)_nelems)->data);
+13  double *arg0_0 = (double *)(((PyArrayObject *)_arg0_0)->data);
+14  double *arg1_0 = (double *)(((PyArrayObject *)_arg1_0)->data);
+15  int *arg1_0_map0_0 = (int *)(((PyArrayObject *)_arg1_0_map0_0)->data);
+16  double *arg1_0_vec[32][3];
+17  #ifdef _OPENMP
+18  int nthread = omp_get_max_threads();
+19  #else
+20  int nthread = 1;
+21  #endif
+22  #pragma omp parallel shared(boffset, nblocks, nelems, blkmap)
+23  {
+24    int tid = omp_get_thread_num();
+25    #pragma omp for schedule(static)
+26    for (int __b = boffset; __b < boffset + nblocks; __b++)
+27    {
+28      int bid = blkmap[__b];
+29      int nelem = nelems[bid];
+30      int efirst = offset[bid];
+31      for (int n = efirst; n < efirst+ nelem; n++ )
+32      {
+33        int i = n;
+34        arg1_0_vec[tid][0] = arg1_0 + arg1_0_map0_0[i * 3 + 0] * 2;
+35        arg1_0_vec[tid][1] = arg1_0 + arg1_0_map0_0[i * 3 + 1] * 2;
+36        arg1_0_vec[tid][2] = arg1_0 + arg1_0_map0_0[i * 3 + 2] * 2;
+37        midpoint(arg0_0 + i * 2, arg1_0_vec[tid]);
+38      }
+39    }
+40  }
+41}
+
+
+

Computation is split into nblocks blocks which start at an initial offset +boffset and correspond to colours that can be executed conflict free in +parallel. This loop over colours is therefore wrapped in an OpenMP parallel +region and is annotated with an omp for pragma. The block id bid for +each of these blocks is given by the block map blkmap and is the index +into the arrays nelems and offset provided as part of the execution +plan. These are the number of elements that are part of the given block and +its starting index. Note that each thread needs its own staging array +arg1_0_vec, which is therefore scoped by the thread id.

+
+
+
+

Device backends

+

As with the host backends, the device backends have most of the implementation +in common. The PyOP2 data carriers Dat, Global +and Const have a data array in host memory and a separate +array in device memory. Flags indicate the present state of a given data +carrier:

+
    +
  • DEVICE_UNALLOCATED: no data is allocated on the device

  • +
  • HOST_UNALLOCATED: no data is allocated on the host

  • +
  • DEVICE: data is up-to-date (valid) on the device, but invalid on the +host

  • +
  • HOST: data is up-to-date (valid) on the host, but invalid on the device

  • +
  • BOTH: data is up-to-date (valid) on both the host and device

  • +
+

When a par_loop() is called, PyOP2 uses the +Access descriptors to determine which data needs to be allocated or +transferred from host to device prior to launching the kernel. Data is only +transferred if it is out of date at the target location and all data transfer +is triggered lazily i.e. the actual copy only occurs once the data is +requested. In particular there is no automatic transfer back of data from +device to host unless it is accessed on the host.

+

A newly created device Dat has no associated device data and +starts out in the state DEVICE_UNALLOCATED. The diagram below shows all +actions that involve a state transition, which can be divided into three +groups: calling explicit data transfer functions (red), accessing data on the +host (black) and using the Dat in a par_loop() +(blue). There is no need for users to explicitly initiate data transfers and +the transfer functions are only given for completeness.

+
+_images/pyop2_device_data_state.svg
+

State transitions of a data carrier on PyOP2 device backends

+
+
+

When a device Dat is used in a par_loop() for the +first time, data is allocated on the device. If the Dat is +only read, the host array is transferred to device if it was in state HOST +or DEVICE_UNALLOCATED before the par_loop() and the +Dat is in the state BOTH afterwards, unless it was in +state DEVICE in which case it remains in that state. If the +Dat is written to, data transfer before the +par_loop() is necessary unless the access descriptor is +WRITE. The host data is out of date afterwards and the +Dat is in the state DEVICE. An overview of the state +transitions and necessary memory allocations and data transfers for the two +cases is given in the table below:

+ + + + + + + + + + + + + + + + + + + + + + + + + +

Initial state

par_loop() read

par_loop() written to

DEVICE_UNALLOCATED

BOTH (alloc, transfer h2d)

DEVICE (alloc, transfer h2d unless write-only)

DEVICE

DEVICE

DEVICE

HOST

BOTH (transfer h2d)

DEVICE (transfer h2d unless write-only)

BOTH

BOTH

DEVICE

+

Accessing data on the host initiates a device to host data transfer if the +Dat is in state DEVICE and leaves it in state HOST +when using the data() property and BOTH when using +data_ro().

+

The state transitions described above apply in the same way to a +Global. A Const is read-only, never modified +on device and therefore never out of date on the host. Hence there is no +state DEVICE and it is not necessary to copy back Const +data from device to host.

+
+

CUDA backend

+

The CUDA backend makes extensive use of PyCUDA and its infrastructure for +just-in-time compilation of CUDA kernels and interfacing them to Python. +Linear solvers and sparse matrix data structures are implemented on top of the +CUSP library and are described in greater detail in PyOP2 Linear Algebra Interface. +Code generation uses a template based approach, where a __global__ stub +routine to be called from the host is generated, which takes care of data +marshalling and calling the user kernel as an inline __device__ function.

+

We consider the same midpoint kernel as in the previous examples, which +requires no CUDA-specific modifications and is automatically annotated with a +__device__ qualifier. PyCUDA automatically generates a host stub for the +generated kernel stub __midpoint_stub given a list of parameter types. It +takes care of translating Python objects to plain C data types and pointers, +such that a CUDA kernel can be launched straight from Python. The entire CUDA +code PyOP2 generates is as follows:

+
 1__device__ void midpoint(double p[2], double *coords[2])
+ 2{
+ 3  p[0] = ((coords[0][0] + coords[1][0]) + coords[2][0]) / 3.0;
+ 4  p[1] = ((coords[0][1] + coords[1][1]) + coords[2][1]) / 3.0;
+ 5}
+ 6
+ 7__global__ void __midpoint_stub(int set_size, int set_offset,
+ 8    double *arg0,
+ 9    double *ind_arg1,
+10    int *ind_map,
+11    short *loc_map,
+12    int *ind_sizes,
+13    int *ind_offs,
+14    int block_offset,
+15    int *blkmap,
+16    int *offset,
+17    int *nelems,
+18    int *nthrcol,
+19    int *thrcol,
+20    int nblocks) {
+21  extern __shared__ char shared[];
+22  __shared__ int *ind_arg1_map;
+23  __shared__ int ind_arg1_size;
+24  __shared__ double * ind_arg1_shared;
+25  __shared__ int nelem, offset_b, offset_b_abs;
+26
+27  double *ind_arg1_vec[3];
+28
+29  if (blockIdx.x + blockIdx.y * gridDim.x >= nblocks) return;
+30  if (threadIdx.x == 0) {
+31    int blockId = blkmap[blockIdx.x + blockIdx.y * gridDim.x + block_offset];
+32    nelem = nelems[blockId];
+33    offset_b_abs = offset[blockId];
+34    offset_b = offset_b_abs - set_offset;
+35
+36    ind_arg1_size = ind_sizes[0 + blockId * 1];
+37    ind_arg1_map = &ind_map[0 * set_size] + ind_offs[0 + blockId * 1];
+38
+39    int nbytes = 0;
+40    ind_arg1_shared = (double *) &shared[nbytes];
+41  }
+42
+43  __syncthreads();
+44
+45  // Copy into shared memory
+46  for ( int idx = threadIdx.x; idx < ind_arg1_size * 2; idx += blockDim.x ) {
+47    ind_arg1_shared[idx] = ind_arg1[idx % 2 + ind_arg1_map[idx / 2] * 2];
+48  }
+49
+50  __syncthreads();
+51
+52  // process set elements
+53  for ( int idx = threadIdx.x; idx < nelem; idx += blockDim.x ) {
+54    ind_arg1_vec[0] = ind_arg1_shared + loc_map[0*set_size + idx + offset_b]*2;
+55    ind_arg1_vec[1] = ind_arg1_shared + loc_map[1*set_size + idx + offset_b]*2;
+56    ind_arg1_vec[2] = ind_arg1_shared + loc_map[2*set_size + idx + offset_b]*2;
+57
+58    midpoint(arg0 + 2 * (idx + offset_b_abs), ind_arg1_vec);
+59  }
+60}
+
+
+

The CUDA kernel __midpoint_stub is launched on the GPU for a specific +number of threads in parallel. Each thread is identified inside the kernel by +its thread id threadIdx within a block of threads identified by a two +dimensional block id blockIdx within a grid of blocks.

+

As for OpenMP, there is the potential for data races, which are prevented by +colouring the iteration set and computing a parallel execution plan, where all +elements of the same colour can be modified simultaneously. Each colour is +computed by a block of threads in parallel. All threads of a thread block have +access to a shared memory, which is used as a shared staging area initialised +by thread 0 of each block, see lines 30-41 above. A call to +__syncthreads() ensures these initial values are visible to all threads of +the block. After this barrier, all threads cooperatively gather data from the +indirectly accessed Dat via the Map, followed +by another synchronisation. Following that, each thread loops over the +elements in the partition with an increment of the block size. In each +iteration a thread-private array of pointers to coordinate data in shared +memory is built which is then passed to the midpoint kernel. As for other +backends, the first, directly accessed, argument, is passed as a pointer to +global device memory with a suitable offset.

+
+
+

OpenCL backend

+

The other device backend OpenCL is structurally very similar to the CUDA +backend. It uses PyOpenCL to interface to the OpenCL drivers and runtime. +Linear algebra operations are handled by PETSc as described in +PyOP2 Linear Algebra Interface. PyOP2 generates a kernel stub from a template similar +to the CUDA case.

+

Consider the midpoint kernel from previous examples, whose parameters in +the kernel signature are automatically annotated with OpenCL storage +qualifiers. PyOpenCL provides Python wrappers for OpenCL runtime functions to +build a kernel from a code string, set its arguments and enqueue the kernel +for execution. It takes care of the necessary conversion from Python objects +to plain C data types. PyOP2 generates the following code for the midpoint +example:

+
 1#define ROUND_UP(bytes) (((bytes) + 15) & ~15)
+ 2
+ 3void midpoint(__global double p[2], __local double *coords[2]);
+ 4void midpoint(__global double p[2], __local double *coords[2])
+ 5{
+ 6  p[0] = ((coords[0][0] + coords[1][0]) + coords[2][0]) / 3.0;
+ 7  p[1] = ((coords[0][1] + coords[1][1]) + coords[2][1]) / 3.0;
+ 8}
+ 9
+10__kernel __attribute__((reqd_work_group_size(668, 1, 1)))
+11void __midpoint_stub(
+12    __global double* arg0,
+13    __global double* ind_arg1,
+14    int set_size,
+15    int set_offset,
+16    __global int* p_ind_map,
+17    __global short *p_loc_map,
+18    __global int* p_ind_sizes,
+19    __global int* p_ind_offsets,
+20    __global int* p_blk_map,
+21    __global int* p_offset,
+22    __global int* p_nelems,
+23    __global int* p_nthrcol,
+24    __global int* p_thrcol,
+25    __private int block_offset) {
+26  __local char shared [64] __attribute__((aligned(sizeof(long))));
+27  __local int offset_b;
+28  __local int offset_b_abs;
+29  __local int active_threads_count;
+30
+31  int nbytes;
+32  int block_id;
+33
+34  int i_1;
+35  // shared indirection mappings
+36  __global int* __local ind_arg1_map;
+37  __local int ind_arg1_size;
+38  __local double* __local ind_arg1_shared;
+39  __local double* ind_arg1_vec[3];
+40
+41  if (get_local_id(0) == 0) {
+42    block_id = p_blk_map[get_group_id(0) + block_offset];
+43    active_threads_count = p_nelems[block_id];
+44    offset_b_abs = p_offset[block_id];
+45    offset_b = offset_b_abs - set_offset;ind_arg1_size = p_ind_sizes[0 + block_id * 1];
+46    ind_arg1_map = &p_ind_map[0 * set_size] + p_ind_offsets[0 + block_id * 1];
+47
+48    nbytes = 0;
+49    ind_arg1_shared = (__local double*) (&shared[nbytes]);
+50    nbytes += ROUND_UP(ind_arg1_size * 2 * sizeof(double));
+51  }
+52  barrier(CLK_LOCAL_MEM_FENCE);
+53
+54  // staging in of indirect dats
+55  for (i_1 = get_local_id(0); i_1 < ind_arg1_size * 2; i_1 += get_local_size(0)) {
+56    ind_arg1_shared[i_1] = ind_arg1[i_1 % 2 + ind_arg1_map[i_1 / 2] * 2];
+57  }
+58  barrier(CLK_LOCAL_MEM_FENCE);
+59
+60  for (i_1 = get_local_id(0); i_1 < active_threads_count; i_1 += get_local_size(0)) {
+61    ind_arg1_vec[0] = ind_arg1_shared + p_loc_map[i_1 + 0*set_size + offset_b] * 2;
+62    ind_arg1_vec[1] = ind_arg1_shared + p_loc_map[i_1 + 1*set_size + offset_b] * 2;
+63    ind_arg1_vec[2] = ind_arg1_shared + p_loc_map[i_1 + 2*set_size + offset_b] * 2;
+64
+65    midpoint((__global double* __private)(arg0 + (i_1 + offset_b_abs) * 2), ind_arg1_vec);
+66  }
+67}
+
+
+

Parallel computations in OpenCL are executed by work items organised into +work groups. OpenCL requires the annotation of all pointer arguments with +the memory region they point to: __global memory is visible to any work +item, __local memory to any work item within the same work group and +__private memory is private to a work item. PyOP2 does this annotation +automatically for the user kernel if the OpenCL backend is used. Local memory +therefore corresponds to CUDA’s shared memory and private memory is called +local memory in CUDA. The work item id within the work group is accessed via +the OpenCL runtime call get_local_id(0), the work group id via +get_group_id(0). A barrier synchronisation across all work items of a work +group is enforced with a call to barrier(CLK_LOCAL_MEM_FENCE). Bearing +these differences in mind, the OpenCL kernel stub is structurally almost +identical to the corresponding CUDA version above.

+

The required work group size reqd_work_group_size is +computed as part of the execution plan. In CUDA this value is a launch +parameter to the kernel, whereas in OpenCL it needs to be hard coded as a +kernel attribute.

+
+
+
+ + +
+
+
+
+ +
+
+ + + + \ No newline at end of file diff --git a/caching.html b/caching.html new file mode 100644 index 000000000..4105ce9e2 --- /dev/null +++ b/caching.html @@ -0,0 +1,221 @@ + + + + + + + + Caching in PyOP2 — PyOP2 2020.0 documentation + + + + + + + + + + + + + + +
+
+
+
+ +
+

Caching in PyOP2

+

PyOP2 makes heavy use of caches to ensure performance is not adversely +affected by too many runtime computations. The caching in PyOP2 takes +a number of forms:

+
    +
  1. Disk-based caching of generated code

    +

    Since compiling a generated code module may be an expensive +operation, PyOP2 caches the generated code on disk such that +subsequent runs of the same simulation will not have to pay a +compilation cost.

    +
  2. +
  3. In memory caching of generated code function pointers

    +

    Once code has been generated and loaded into the running PyOP2 +process, we cache the resulting callable function pointer for the +lifetime of the process, such that subsequent calls to the same +generated code are fast.

    +
  4. +
  5. In memory caching of expensive to build objects

    +

    Some PyOP2 objects, in particular Sparsity objects, +can be expensive to construct. Since a sparsity does not change if +it is built again with the same arguments, we only construct the +sparsity once for each unique set of arguments.

    +
  6. +
+

The caching strategies for PyOP2 follow from two axioms:

+
    +
  1. For PyOP2 Sets and Maps, equality +is identity

  2. +
  3. Caches of generated code should depend on metadata, but not data

  4. +
+

The first axiom implies that two Sets or +Maps compare equal if and only if they are the same +object. The second implies that generated code must be independent +of the absolute size of the data the par_loop() that +generated it executed over. For example, the size of the iteration +set should not be part of the key, but the arity of any maps and size +and type of every data item should be.

+

One consequence of these rules is that there are effectively two +separate types of cache in PyOP2, object and class caches, +distinguished by where the cache itself lives.

+
+

Class caches

+

These are used to cache objects that depend on metadata, but not +object instances, such as generated code. They are implemented by +the cacheable class inheriting from Cached.

+
+

Note

+

There is currently no eviction strategy for class caches. Should +they grow too large, for example by executing many different parallel +loops, an out of memory error can occur.

+
+
+
+

Object caches

+

These are used to cache objects that are built on top of +Sets and Maps. They are implemented by the +cacheable class inheriting from ObjectCached and the +caching instance defining a _cache attribute.

+

The motivation for these caches is that the cache key for objects such as +sparsities relies on an identical sparsity being built if the +arguments are identical. So that users of the API do not have to +worry too much about carrying around “temporary” objects forever such +that they will hit caches, PyOP2 builds up a hierarchy of caches of +transient objects on top of the immutable sets and maps.

+

So, for example, the user can build and throw away +DataSets as normal in their code. Internally, however, +these instances are cached on the set they are built on top of. Thus, +in the following snippet, we have that ds and ds2 are the same +object:

+
s = op2.Set(1)
+ds = op2.DataSet(s, 10)
+ds2 = op2.DataSet(s, 10)
+assert ds is ds2
+
+
+

The setup of these caches is such that the lifetime of objects in the +cache is tied to the lifetime of both the caching and the cached +object. In the above example, as long as the user program holds a +reference to one of s, ds or ds2 all three objects will +remain live. As soon as all references are lost, all three become +candidates for garbage collection.

+
+

Note

+

The cache eviction strategy for these caches relies on the Python +garbage collector, and hence on the user not holding onto +references to some of either the cached or the caching objects for +too long. Should the objects on which the caches live persist, an +out of memory error may occur.

+
+
+
+

Debugging cache leaks

+

To debug potential problems with the cache, PyOP2 can be instructed to +print the size of both object and class caches at program exit. This +can be done by setting the environment variable +PYOP2_PRINT_CACHE_SIZE to 1 before running a PyOP2 program, or +passing print_cache_size to init().

+
+
+ + +
+
+
+
+ +
+
+ + + + \ No newline at end of file diff --git a/concepts.html b/concepts.html new file mode 100644 index 000000000..f7869339e --- /dev/null +++ b/concepts.html @@ -0,0 +1,363 @@ + + + + + + + + PyOP2 Concepts — PyOP2 2020.0 documentation + + + + + + + + + + + + + + +
+
+
+
+ +
+

PyOP2 Concepts

+

Many numerical algorithms and scientific computations on unstructured meshes +can be viewed as the independent application of a local operation +everywhere on a mesh. This local operation is often called a computational +kernel and its independent application lends itself naturally to parallel +computation. An unstructured mesh can be described by sets of entities +(vertices, edges, cells) and the connectivity between those sets forming the +topology of the mesh.

+

PyOP2 is a domain-specific language (DSL) for the parallel execution of +computational kernels on unstructured meshes or graphs.

+
+

Sets and mappings

+

A mesh is defined by sets of entities and +mappings between these sets. Sets are used to represent +entities in the mesh (nodes in the graph) or degrees of freedom of data +(fields) living “on” the mesh (graph), while maps define the connectivity +between entities (links in the graph) or degrees of freedom, for example +associating an edge with its incident vertices. Sets of mesh entities may +coincide with sets of degrees of freedom, but this is not necessarily the case +e.g. the set of degrees of freedom for a field may be defined on the vertices +of the mesh and the midpoints of edges connecting the vertices.

+
+

Note

+

There is a requirement for the map to be of constant arity, that is each +element in the source set must be associated with a constant number of +elements in the target set. There is no requirement for the map to be +injective or surjective. This restriction excludes certain kinds of mappings +e.g. a map from vertices to incident edges or cells is only possible on a +very regular mesh where the multiplicity of any vertex is constant.

+
+

In the following we declare a Set vertices, a +Set edges and a Map edges2vertices +between them, which associates the two incident vertices with each edge:

+
vertices = op2.Set(4)
+edges = op2.Set(3)
+edges2vertices = op2.Map(edges, vertices, 2, [[0, 1], [1, 2], [2, 3]])
+
+
+
+
+

Data

+

PyOP2 distinguishes three kinds of user provided data: data that lives on a +set (often referred to as a field) is represented by a Dat, +data that has no association with a set by a Global and data +that is visible globally and referred to by a unique identifier is declared as +Const. Examples of the use of these data types are given in +the Parallel loops section below.

+
+

Dat

+

Since a set does not have any type but only a cardinality, data declared on a +set through a Dat needs additional metadata to allow PyOP2 to +interpret the data and to specify how much memory is required to store it. This +metadata is the datatype and the shape of the data associated with any +given set element. The shape is not associated with the Dat +directly, but with a DataSet. One can associate a scalar with +each element of the set or a one- or higher-dimensional vector. Similar to the +restriction on maps, the shape and therefore the size of the data associated +with each element needs to be uniform. PyOP2 supports all common primitive +data types supported by NumPy. Custom datatypes are supported insofar as +the user implements the serialisation and deserialisation of that type into +primitive data that can be handled by PyOP2.

+

Declaring coordinate data on the vertices defined above, where two float +coordinates are associated with each vertex, is done like this:

+
dvertices = op2.DataSet(vertices, dim=2)
+coordinates = op2.Dat(dvertices,
+                      [[0.0, 0.0], [0.0, 1.0], [1.0, 1.0], [1.0, 0.0]],
+                      dtype=float)
+
+
+
+
+

Global

+

In contrast to a Dat, a Global has no +association to a set and the shape and type of the data are declared directly +on the Global. A 2x2 elasticity tensor would be defined as +follows:

+
elasticity = op2.Global((2, 2), [[1.0, 0.0], [0.0, 1.0]], dtype=float)
+
+
+
+
+

Const

+

Data that is globally visible and read-only to kernels is declared with a +Const and needs to have a globally unique identifier. It does +not need to be declared as an argument to a par_loop(), but is +accessible in a kernel by name. A globally visible parameter eps would be +declared as follows:

+
eps = op2.Const(1, 1e-14, name="eps", dtype=float)
+
+
+
+
+

Mat

+

In a PyOP2 context, a (sparse) matrix is a linear operator from one set to +another. In other words, it is a linear function which takes a +Dat on one set A and returns the value of a +Dat on another set B. Of course, in particular, +A may be the same set as B. This makes the operation of at +least some matrices equivalent to the operation of a particular PyOP2 kernel.

+

PyOP2 can be used to assemble matrices, which are defined +on a sparsity pattern which is built from a pair of +DataSets defining the row and column spaces the +sparsity maps between and one or more pairs of maps, one for the row and one +for the column space of the matrix respectively. The sparsity uniquely defines +the non-zero structure of the sparse matrix and can be constructed purely from +those mappings. To declare a Mat on a Sparsity +only the data type needs to be given.

+

Since the construction of large sparsity patterns is a very expensive +operation, the decoupling of Mat and Sparsity +allows the reuse of sparsity patterns for a number of matrices without +recomputation. In fact PyOP2 takes care of caching sparsity patterns on behalf +of the user, so declaring a sparsity on the same maps as a previously declared +sparsity yields the cached object instead of building another one.

+

Defining a matrix of floats on a sparsity which spans from the space of +vertices to the space of vertices via the edges is done as follows:

+
sparsity = op2.Sparsity((dvertices, dvertices),
+                        [(edges2vertices, edges2vertices)])
+matrix = op2.Mat(sparsity, float)
+
+
+
+
+
+

Parallel loops

+

Computations in PyOP2 are executed as parallel loops +of a Kernel over an iteration set. Parallel loops are the +core construct of PyOP2 and hide most of its complexity such as parallel +scheduling, partitioning, colouring, data transfer from and to device and +staging of the data into on chip memory. Computations in a parallel loop must +be independent of the order in which they are executed over the set to allow +PyOP2 maximum flexibility to schedule the computation in the most efficient +way. Kernels are described in more detail in PyOP2 Kernels.

+
+

Loop invocations

+

A parallel loop invocation requires as arguments, other than the iteration set +and the kernel to operate on, the data the kernel reads and/or writes. A +parallel loop argument is constructed by calling the underlying data object +(i.e. the Dat or Global) and passing an +access descriptor and the mapping to be used when accessing the data. The +mapping is required for an indirectly accessed Dat not +declared on the same set as the iteration set of the parallel loop. In the +case of directly accessed data defined on the same set as the iteration set +the map is omitted and only an access descriptor given.

+

Consider a parallel loop that translates the coordinate field by a +constant offset given by the Const offset. Note how the +kernel has access to the local variable offset even though it has not been +passed as an argument to the par_loop(). This loop is direct and +the argument coordinates is read and written:

+
op2.Const(2, [1.0, 1.0], dtype=float, name="offset");
+
+translate = op2.Kernel("""void translate(double * coords) {
+  coords[0] += offset[0];
+  coords[1] += offset[1];
+}""", "translate")
+
+op2.par_loop(translate, vertices, coordinates(op2.RW))
+
+
+
+
+

Access descriptors

+

Access descriptors define how the data is accessed by the kernel and give +PyOP2 crucial information as to how the data needs to be treated during +staging in before and staging out after kernel execution. They must be one of +pyop2.READ (read-only), pyop2.WRITE (write-only), +pyop2.RW (read-write), pyop2.INC (increment), +pyop2.MIN (minimum reduction) or pyop2.MAX (maximum +reduction).

+

Not all of these descriptors apply to all PyOP2 data types. A +Dat can have modes READ, WRITE, +RW and INC. For a Global the +valid modes are READ, INC, MIN and +MAX and for a Mat only WRITE and +INC are allowed.

+
+
+

Loops assembling matrices

+

We declare a parallel loop assembling the matrix via a given kernel +which we’ll assume has been defined before over the edges and with +coordinates as input data. The matrix is the output argument of this +parallel loop and therefore has the access descriptor INC since +the assembly accumulates contributions from different vertices via the +edges2vertices mapping. Note that the mappings are being indexed with the +iteration indices op2.i[0] and +op2.i[1] respectively. This means that PyOP2 generates a local +iteration space of size arity * arity with the +arity of the Map edges2vertices for any given element +of the iteration set. This local iteration space is then iterated over using +the iteration indices on the maps. The kernel is assumed to only apply to a +single point in that local iteration space. The coordinates are accessed +via the same mapping, but are a read-only input argument to the kernel and +therefore use the access descriptor READ:

+
op2.par_loop(kernel, edges,
+             matrix(op2.INC, (edges2vertices[op2.i[0]],
+                              edges2vertices[op2.i[1]])),
+             coordinates(op2.READ, edges2vertices))
+
+
+

You can stack up multiple successive parallel loops that add values to +a matrix. Before you use the resulting values, you must explicitly +tell PyOP2 that you want to do so, by calling +assemble() on the matrix. Note that executing a +solve() will do this automatically for you.

+
+
+

Loops with global reductions

+

Globals are used primarily for reductions where a +given quantity on a field is reduced to a single number by summation or +finding the minimum or maximum. Consider a kernel computing the L2 norm of +the pressure field defined on the set of vertices as l2norm. Note +that the Dat constructor automatically creates an anonymous +DataSet of dimension 1 if a Set is passed as +the first argument. We assume pressure is the result of some prior +computation and only give the declaration for context.

+
pressure = op2.Dat(vertices, [...], dtype=float)
+l2norm = op2.Global(dim=1, data=[0.0])
+
+norm = op2.Kernel("""void norm(double * out, double * field) {
+  *out += field[0] * field[0];
+}""", "norm")
+
+op2.par_loop(pressure, vertices,
+             l2norm(op2.INC),
+             vertices(op2.READ))
+
+
+
+
+
+ + +
+
+
+
+ +
+
+ + + + \ No newline at end of file diff --git a/genindex.html b/genindex.html new file mode 100644 index 000000000..86727652d --- /dev/null +++ b/genindex.html @@ -0,0 +1,1150 @@ + + + + + + + Index — PyOP2 2020.0 documentation + + + + + + + + + + + + +
+
+
+
+ + +

Index

+ +
+ A + | C + | D + | E + | F + | G + | H + | I + | K + | L + | M + | N + | O + | P + | R + | S + | T + | U + | V + | W + | Z + +
+

A

+ + + +
+ +

C

+ + + +
+ +

D

+ + + +
+ +

E

+ + + +
+ +

F

+ + + +
+ +

G

+ + + +
+ +

H

+ + + +
+ +

I

+ + + +
+ +

K

+ + +
+ +

L

+ + + +
+ +

M

+ + +
+ +

N

+ + + +
+ +

O

+ + + +
+ +

P

+ + + +
    +
  • + pyop2.types.access + +
  • +
  • + pyop2.types.dat + +
  • +
  • + pyop2.types.data_carrier + +
  • +
  • + pyop2.types.dataset + +
  • +
  • + pyop2.types.glob + +
  • +
  • + pyop2.types.halo + +
  • +
  • + pyop2.types.map + +
  • +
  • + pyop2.types.set + +
  • +
  • + pyop2.utils + +
  • +
+ +

R

+ + + +
+ +

S

+ + + +
+ +

T

+ + + +
+ +

U

+ + + +
+ +

V

+ + + +
+ +

W

+ + +
+ +

Z

+ + +
+ + + +
+
+
+
+ +
+
+ + + + \ No newline at end of file diff --git a/index.html b/index.html new file mode 100644 index 000000000..10e346c10 --- /dev/null +++ b/index.html @@ -0,0 +1,224 @@ + + + + + + + + Welcome to PyOP2’s documentation! — PyOP2 2020.0 documentation + + + + + + + + + + + + + +
+
+
+
+ +
+

Welcome to PyOP2’s documentation!

+
+

Warning

+

The prose documentation contained here is significantly out-of-date and thus +contains many inaccuracies. It is, nevertheless, quite a useful resource for +people new to PyOP2. Please read with care.

+

The API documentation, however, is updated regularly and can be considered +accurate.

+
+

Contents:

+
+ +
+
+
+

Indices and tables

+ +
+ + +
+
+
+
+ +
+
+ + + + \ No newline at end of file diff --git a/installation.html b/installation.html new file mode 100644 index 000000000..e9f96cb8e --- /dev/null +++ b/installation.html @@ -0,0 +1,128 @@ + + + + + + + + Installing PyOP2 — PyOP2 2020.0 documentation + + + + + + + + + + + + + + +
+
+
+
+ + build status + + +
+

Installing PyOP2

+

PyOP2 requires Python 3.6 or later.

+

The main testing platform for PyOP2 is Ubuntu 18.04 64-bit with Python +3.6. Later Ubuntu versions should also work. Some users successfully +use PyOP2 on Mac OS X.

+

Installation of the dependencies is somewhat involved, and therefore +the recommended way to obtain PyOP2 is by using the Firedrake +installation script. This will give +you a Python 3 venv that contains a working PyOP2 installation.

+
+ + +
+
+
+
+ +
+
+ + + + \ No newline at end of file diff --git a/ir.html b/ir.html new file mode 100644 index 000000000..2a812c3e8 --- /dev/null +++ b/ir.html @@ -0,0 +1,412 @@ + + + + + + + + The PyOP2 Intermediate Representation — PyOP2 2020.0 documentation + + + + + + + + + + + + + + +
+
+
+
+ +
+

The PyOP2 Intermediate Representation

+

The parallel loop is the main construct of PyOP2. +It applies a specific Kernel to all elements in the iteration +set of the parallel loop. Here, we describe how to use the PyOP2 API to build +a kernel and, also, we provide simple guidelines on how to write efficient +kernels.

+
+

Using the Intermediate Representation

+

In the previous section, we described the API for +PyOP2 kernels in terms of the C code that gets executed. +Passing in a string of C code is the simplest way of creating a +Kernel. Another possibility is to use PyOP2 Intermediate +Representation (IR) objects to express the Kernel semantics.

+

An Abstract Syntax Tree of the kernel code can be manually built using IR +objects. Since PyOP2 has been primarily designed to be fed by higher layers +of abstractions, rather than by users, no C-to-AST parser is currently provided. +The advantage of providing an AST, instead of C code, is that it enables PyOP2 +to inspect and transform the kernel, which is aimed at achieving performance +portability among different architectures and, more generally, better execution +times.

+

For the purposes of exposition, let us consider a simple +kernel init which initialises the members of a Dat +to zero.

+
from op2 import Kernel
+
+code = """void init(double* edge_weight) {
+  for (int i = 0; i < 3; i++)
+    edge_weight[i] = 0.0;
+}"""
+kernel = Kernel(code, "init")
+
+
+

Here, we describe how we can use PyOP2 IR objects to build an AST for +this kernel. For example, the most basic AST one can come up with +is

+
from op2 import Kernel
+from ir.ast_base import *
+
+ast = FlatBlock("""void init(double* edge_weight) {
+  for (int i = 0; i < 3; i++)
+    edge_weight[i] = 0.0;
+}""")
+kernel = Kernel(ast, "init")
+
+
+

The FlatBlock object encapsulates a “flat” block +of code, which is not modified by the IR engine. A +FlatBlock is used to represent (possibly large) +fragments of code for which we are not interested in any kind of +transformation, so it may be particularly useful to speed up code development +when writing, for example, test cases or non-expensive kernels. On the other +hand, time-demanding kernels should be properly represented using a “real” +AST. For example, a useful AST for init could be the following

+
from op2 import Kernel
+from ir.ast_base import *
+
+ast_body = [FlatBlock("...some code can go here..."),
+            c_for("i", 3, Assign(Symbol("edge_weight", ("i",)), c_sym("0.0")))]
+ast = FunDecl("void", "init",
+              [Decl("double*", c_sym("edge_weight"))],
+              ast_body)
+kernel = Kernel(ast, "init")
+
+
+

In this example, we first construct the body of the kernel function. We have +an initial FlatBlock that contains, for instance, +some sort of initialization code. c_for() is a shortcut +for building a for loop. It takes an +iteration variable (i), the extent of the loop and its body. Multiple +statements in the body can be passed in as a list. +c_sym() is a shortcut for building symbols. You may want to use +c_sym() when the symbol makes no explicit use of +iteration variables.

+

We use Symbol instead of +c_sym(), when edge_weight accesses a specific +element using the iteration variable i. This is fundamental to allow the +IR engine to perform many kinds of transformations involving the kernel’s +iteration space(s). Finally, the signature of the function is constructed +using the FunDecl.

+

Other examples on how to build ASTs can be found in the tests folder, +particularly looking into test_matrices.py and +test_iteration_space_dats.py.

+
+
+

Achieving Performance Portability with the IR

+

One of the key objectives of PyOP2 is obtaining performance portability. +This means that exactly the same program can be executed on a range of +different platforms, and that the PyOP2 engine will strive to get the best +performance out of the chosen platform. PyOP2 allows users to write kernels +by completely abstracting from the underlying machine. This is mainly +achieved in two steps:

+
    +
  • Given the AST of a kernel, PyOP2 applies a first transformation aimed at +mapping the parallelism inherent to the kernel to that available in the +backend.

  • +
  • Then, PyOP2 applies optimizations to the sequential code, depending on the +underlying backend.

  • +
+

To maximize the outcome of the transformation process, it is important that +kernels are written as simply as possible. That is, premature optimization, +possibly for a specific backend, might harm performance.

+

A minimal language, the so-called PyOP2 Kernel Domain-Specific Language, is +used to trigger specific transformations. If we had had a parser from C +code to AST, we would have embedded this DSL in C by means of pragmas. +As we directly build an AST, we achieve the same goal by decorating AST nodes +with specific attributes, added at node creation-time. An overview of the +language follows

+
    +
  • pragma pyop2 itspace. This is added to For +nodes (i.e. written on top of for loops). It tells PyOP2 that the following +is a fully-parallel loop, that is all of its iterations can be executed in +parallel without any sort of synchronization.

  • +
  • pragma pyop2 assembly(itvar1, itvar2). This is added to a statement node, +to denote that we are performing a local assembly operation along the +itvar1 and itvar2 dimensions.

  • +
  • pragma pyop2 simd. This is added on top of the kernel signature. It is +used to suggest that PyOP2 apply SIMD vectorization along the ParLoop’s +iteration set dimension. This kind of vectorization is also known as +inter-kernel vectorization. This feature is currently not supported +by PyOP2, and will be added only in a future release.

  • +
+

The itspace pragma tells PyOP2 how to extract parallelism from the kernel. +Consider again our usual example. To expose a parallel iteration space, one +must write

+
from op2 import Kernel
+
+code = """void init(double* edge_weight) {
+  #pragma pyop2 itspace
+  for (int i = 0; i < 3; i++)
+    edge_weight[i] = 0.0;
+}"""
+kernel = Kernel(code, "init")
+
+
+

The c_for() shortcut when creating an AST expresses +the same semantics as a for loop decorated with a pragma pyop2 itspace.

+

Now, imagine we are executing the init kernel on a CPU architecture. +Typically we want a single core to execute the entire kernel, because it is +very likely that the kernel’s iteration space is small and its working set +fits the L1 cache, and no benefit would be gained by splitting the computation +between distinct cores. On the other hand, if the backend is a GPU or an +accelerator, a different execution model might give better performance. +There’s a huge amount of parallelism available, for example, in a GPU, so +delegating the execution of an individual iteration (or a chunk of iterations) +to a single thread could pay off. If that is the case, the PyOP2 IR engine +re-structures the kernel code to exploit such parallelism.

+
+
+

Optimizing kernels on CPUs

+

So far, some effort has been spent on optimizations for CPU platforms. Being a +DSL, PyOP2 provides specific support for those (linear algebra) operations that +are common among unstructured-mesh-based numerical methods. For example, PyOP2 +is capable of aggressively optimizing local assembly codes for applications +based on the Finite Element Method. We therefore distinguish optimizations in +two categories:

+
    +
  • Generic optimizations, such as data alignment and support for autovectorization.

  • +
  • Domain-specific optimizations (DSO)

  • +
+

To trigger DSOs, statements must be decorated using the kernel DSL. For example, +if the kernel computes the local assembly of an element in an unstructured mesh, +then a pragma pyop2 assembly(itvar1, itvar2) should be added on top of the +corresponding statement. When constructing the AST of a kernel, this can be +simply achieved by

+
from ir.ast_base import *
+
+s1 = Symbol("X", ("i",))
+s2 = Symbol("Y", ("j",))
+tensor = Symbol("A", ("i", "j"))
+pragma = "#pragma pyop2 outerproduct(j,k)"
+code = c_for("i", 3, c_for("j", 3, Incr(tensor, Prod(s1, s2), pragma)))
+
+
+

That, conceptually, corresponds to

+
#pragma pyop2 itspace
+for (int i = 0; i < 3; i++)
+  #pragma pyop2 itspace
+  for (int j = 0; j < 3; j++)
+    #pragma pyop2 assembly(i, j)
+    A[i][j] += X[i]*Y[j]
+
+
+

Visiting the AST, PyOP2 finds a 2-dimensional iteration space and an assembly +statement. Currently, #pragma pyop2 itspace is ignored when the backend is +a CPU. The #pragma pyop2 assembly(i, j) can trigger multiple DSOs. +PyOP2 currently lacks an autotuning system that automatically finds out the +best possible kernel implementation; that is, the optimizations that minimize +the kernel run-time. To drive the optimization process, the user (or the +higher layer) can specify which optimizations should be applied. Currently, +PyOP2 can automate:

+
    +
  • Alignment and padding of data structures: for issuing aligned loads and stores.

  • +
  • Loop trip count adjustment according to padding: useful for autovectorization +when the trip count is not a multiple of the vector length

  • +
  • Loop-invariant code motion and autovectorization of invariant code: this is +particularly useful since trip counts are typically small, and hoisted code +can still represent a significant proportion of the execution time

  • +
  • Register tiling for rectangular iteration spaces

  • +
  • (DSO for pragma assembly): Outer-product vectorization + unroll-and-jam of +outer loops to improve register re-use or to mitigate register pressure

  • +
+
+
+

How to select specific kernel optimizations

+

When constructing a Kernel, it is possible to specify the set +of optimizations we want PyOP2 to apply. The IR engine will analyse the kernel +AST and will try to apply, incrementally, such optimizations. PyOP2’s FFC +interface, which builds a Kernel object given an AST provided +by FFC, already makes use of the available optimizations. Here, we take the +emblematic case of the FFC interface and describe how to play with the various +optimizations through a series of examples.

+
ast = ...
+opts = {'licm': False,
+        'tile': None,
+        'ap': False,
+        'vect': None}
+kernel = Kernel(ast, 'my_kernel', opts)
+
+
+

In this example, we have an AST ast and we specify optimizations through +the dictionary opts; then, we build the Kernel, passing in +the optional argument opts. No optimizations are enabled here. The +possible options are:

+
    +
  • licm: Loop-Invariant Code Motion.

  • +
  • tile: Register Tiling (of rectangular iteration spaces)

  • +
  • ap: Data alignment, padding. Trip count adjustment.

  • +
  • vect: SIMD intra-kernel vectorization.

  • +
+

If we wanted to apply both loop-invariant code motion and data alignment, we +would simply write

+
ast = ...
+opts = {'licm': True,
+        'ap': True}
+kernel = Kernel(ast, 'my_kernel', opts)
+
+
+

Now, let’s assume we know the kernel has a rectangular iteration space. We want +to try register tiling, with a particular tile size. The way to get it is

+
ast = ...
+opts = {'tile': (True, 8)}
+kernel = Kernel(ast, 'my_kernel', opts)
+
+
+

In this case, the iteration space is sliced into tiles of size 8x8. If the +iteration space is smaller than the slice, then the transformation is not +applied. By specifying -1 instead of 8, we leave PyOP2 free to choose +automatically a certain tile size.

+

A fundamental optimization for any PyOP2 kernel is SIMD vectorization. This is +because almost always kernels fit the L1 cache and are likely to be compute- +bound. Backend compilers’ AutoVectorization (AV) is therefore an opportunity. +By enforcing data alignment and padding, we can increase the chance AV is +successful. To try AV, one should write

+
import ir.ast_plan as ap
+
+ast = ...
+opts = {'ap': True,
+        'vect': (ap.AUTOVECT, -1)}
+kernel = Kernel(ast, 'my_kernel', opts)
+
+
+

The vect’s second parameter (-1) is ignored when AV is requested. +If our kernel is computing an assembly-like operation, then we can ask PyOP2 +to optimize for register locality and register pressure, by resorting to a +different vectorization technique. Early experiments show that this approach +can be particularly useful when the amount of data movement in the assembly +loops is “significant”. Of course, this depends on kernel parameters (e.g. +size of assembly loop, number and size of arrays involved in the assembly) as +well as on architecture parameters (e.g. size of L1 cache, number of available +registers). This strategy takes the name of Outer-Product Vectorization +(OP), and can be activated in the following way (again, we suggest using it +along with data alignment and padding).

+
import ir.ast_plan as ap
+
+ast = ...
+opts = {'ap': True,
+        'vect': (ap.V_OP_UAJ, 1)}
+kernel = Kernel(ast, 'my_kernel', opts)
+
+
+

UAJ in V_OP_UAJ stands for Unroll-and-Jam. It has been proved that +OP shows a much better performance when used in combination with unrolling the +outer assembly loop and incorporating (jamming) the unrolled iterations +within the inner loop. The second parameter, therefore, specifies the unroll- +and-jam factor: the higher it is, the larger is the number of iterations +unrolled. A factor 1 means that no unroll-and-jam is performed. The optimal +factor highly depends on the computational characteristics of the kernel.

+
+
+ + +
+
+
+
+ +
+
+ + + + \ No newline at end of file diff --git a/kernels.html b/kernels.html new file mode 100644 index 000000000..082634616 --- /dev/null +++ b/kernels.html @@ -0,0 +1,326 @@ + + + + + + + + PyOP2 Kernels — PyOP2 2020.0 documentation + + + + + + + + + + + + + + +
+
+
+
+ +
+

PyOP2 Kernels

+

Kernels in PyOP2 define the local operations that are to be performed for each +element of the iteration set the kernel is executed over. There must be a one +to one match between the arguments declared in the kernel signature and the +actual arguments passed to the parallel loop executing this kernel. As +described in PyOP2 Concepts, data is accessed directly on the iteration set +or via mappings passed in the par_loop() call.

+

The kernel only sees data corresponding to the current element of the +iteration set it is invoked for. Any data read by the kernel i.e. accessed as +READ, RW or INC is automatically +gathered via the mapping relationship in the staging in phase and the kernel +is passed pointers to the staging memory. Similarly, after the kernel has been +invoked, any modified data i.e. accessed as WRITE, +RW or INC is scattered back out via the +Map in the staging out phase. It is only safe for a kernel +to manipulate data in the way declared via the access descriptor in the +parallel loop call. Any modifications to an argument accessed read-only would +not be written back since the staging out phase is skipped for this argument. +Similarly, the result of reading an argument declared as write-only is +undefined since the data has not been staged in.

+
+

Kernel API

+

Consider a par_loop() computing the midpoint of a triangle given +the three vertex coordinates. Note that we make use of a convenience in the +PyOP2 syntax, which allows declaring an anonymous DataSet of a +dimension greater than one by using the ** operator. We omit the actual data in +the declaration of the Map cell2vertex and +Dat coordinates.

+
vertices = op2.Set(num_vertices)
+cells = op2.Set(num_cells)
+
+cell2vertex = op2.Map(cells, vertices, 3, [...])
+
+coordinates = op2.Dat(vertices ** 2, [...], dtype=float)
+midpoints = op2.Dat(cells ** 2, dtype=float)
+
+op2.par_loop(midpoint, cells,
+             midpoints(op2.WRITE),
+             coordinates(op2.READ, cell2vertex))
+
+
+

Kernels are implemented in a restricted subset of C99 and are declared by +passing a C code string and the kernel function name, which must match the +name in the C kernel signature, to the Kernel constructor:

+
midpoint = op2.Kernel("""
+void midpoint(double p[2], double *coords[2]) {
+  p[0] = (coords[0][0] + coords[1][0] + coords[2][0]) / 3.0;
+  p[1] = (coords[0][1] + coords[1][1] + coords[2][1]) / 3.0;
+}""", "midpoint")
+
+
+

Since kernels cannot return any value, the return type is always void. The +kernel argument p corresponds to the third par_loop() +argument midpoints and coords to the fourth argument coordinates +respectively. Argument names need not agree, the matching is by position.

+

Data types of kernel arguments must match the type of data passed to the +parallel loop. The Python types float and numpy.float64 +correspond to a C double, numpy.float32 to a C +float, int or numpy.int64 to a C long and +numpy.int32 to a C int.

+

Direct par_loop() arguments such as midpoints are passed to +the kernel as a double *, indirect arguments such as coordinates as a +double ** with the first indirection due to the map and the second +indirection due to the data dimension. The kernel signature above uses arrays +with explicit sizes to draw attention to the fact that these are known. We +could have interchangeably used a kernel signature with plain pointers:

+
void midpoint(double * p, double ** coords)
+
+
+

Argument creation supports an optional flag flatten, which is used +for kernels which expect data to be laid out by component:

+
midpoint = op2.Kernel("""
+void midpoint(double p[2], double *coords[1]) {
+  p[0] = (coords[0][0] + coords[1][0] + coords[2][0]) / 3.0;
+  p[1] = (coords[3][0] + coords[4][0] + coords[5][0]) / 3.0;
+}""", "midpoint")
+
+op2.par_loop(midpoint, cells,
+             midpoints(op2.WRITE),
+             coordinates(op2.READ, cell2vertex, flatten=True))
+
+
+
+
+

Data layout

+

Data for a Dat declared on a Set is +stored contiguously for all elements of the set. For each element, +this is a contiguous chunk of data of a shape given by the +DataSet dim and the datatype of the +Dat. The size of this chunk is the product of the +extents of the dim tuple times the size of the datatype.

+

During execution of the par_loop(), the kernel is called +for each element of the iteration set and passed data for each of its +arguments corresponding to the current set element i only.

+

For a directly accessed argument such as midpoints above, the +kernel is passed a pointer to the beginning of the chunk of data for +the element i the kernel is currently called for. In CUDA/OpenCL +i is the global thread id since the kernel is launched in parallel +for all elements.

+
+_images/direct_arg.svg
+

Data layout for a directly accessed Dat argument with +dim 2

+
+
+

For an indirectly accessed argument such as coordinates above, +PyOP2 gathers pointers to the data via the Map +cell2vertex used for the indirection. The kernel is passed a list +of pointers of length corresponding to the arity of the +Map, in the example above 3. Each of these points to +the data chunk for the element in the target Set given +by Map entries (i, 0), (i, 1) and (i, 2).

+
+_images/indirect_arg.svg
+

Data layout for a Dat argument with dim 2 indirectly +accessed through a Map of arity 3

+
+
+

If the argument is created with the keyword argument flatten set +to True, a flattened vector of pointers is passed to the kernel. +This vector is of length dim * arity (where dim is the product +of the extents of the dim tuple), which is 6 in the example above. +Each entry points to a single data value of the Dat. +The ordering is by component of dim i.e. the first component of +each data item for each element in the target set pointed to by the +map followed by the second component etc.

+
+_images/indirect_arg_flattened.svg
+

Data layout for a flattened Dat argument with dim 2 +indirectly accessed through a Map of arity 3

+
+
+
+
+

Local iteration spaces

+

PyOP2 supports complex kernels with large local working set sizes, which may +not run very efficiently on architectures with a limited amount of registers +and on-chip resources. In many cases the resource usage is proportional to the +size of the local iteration space the kernel operates on.

+

Consider a finite-element local assembly kernel for vector-valued basis +functions of second order on triangles. There are kernels more complex and +computing considerably larger local tensors commonly found in finite-element +computations, in particular for higher-order basis functions, and this kernel +only serves to illustrate the concept. For each element in the iteration set, +this kernel computes a 12x12 local tensor:

+
void kernel(double A[12][12], ...) {
+  ...
+  // loops over the local iteration space
+  for (int j = 0; j < 12; j++) {
+    for (int k = 0; k < 12; k++) {
+      A[j][k] += ...
+    }
+  }
+}
+
+
+

PyOP2 invokes this kernel for each element in the iteration set:

+
for (int ele = 0; ele < nele; ++ele) {
+  double A[12][12];
+  ...
+  kernel(A, ...);
+}
+
+
+

To improve the efficiency of executing complex kernels on manycore +platforms, their operation can be distributed among several threads +which each compute a single point in this local iteration space to +increase the level of parallelism and to lower the amount of resources +required per thread. In the case of the kernel above we obtain:

+
void mass(double A[1][1], ..., int j, int k) {
+  ...
+  A[0][0] += ...
+}
+
+
+

Note how the doubly nested loop over basis functions is hoisted out of the +kernel, which receives its position in the local iteration space to compute as +additional arguments j and k. PyOP2 then calls the kernel for +each element of the local iteration space for each set element:

+
for (int ele = 0; ele < nele; ++ele) {
+  double A[1][1];
+  ...
+  for (int j = 0; j < 12; j++) {
+    for (int k = 0; k < 12; k++) {
+      kernel(A, ..., j, k);
+    }
+  }
+}
+
+
+

On manycore platforms, the local iteration space does not translate into a +loop nest, but rather into a larger number of threads being launched to +compute each of its elements:

+
+_images/iteration_spaces.svg
+

Local iteration space for a kernel computing a 12x12 local tensor

+
+
+

PyOP2 needs to be told to loop over this local iteration space by +indexing the corresponding maps with an +IterationIndex i in the +par_loop() call.

+
+
+ + +
+
+
+
+ +
+
+ + + + \ No newline at end of file diff --git a/linear_algebra.html b/linear_algebra.html new file mode 100644 index 000000000..172c4c47e --- /dev/null +++ b/linear_algebra.html @@ -0,0 +1,387 @@ + + + + + + + + PyOP2 Linear Algebra Interface — PyOP2 2020.0 documentation + + + + + + + + + + + + + + +
+
+
+
+ +
+

PyOP2 Linear Algebra Interface

+

PyOP2 supports linear algebra operations on sparse matrices using a thin +wrapper around the PETSc library harnessed via its petsc4py interface.

+

As described in PyOP2 Concepts, a sparse matrix is a linear operator that +maps a DataSet representing its row space to a +DataSet representing its column space and vice versa. These +two spaces are commonly the same, in which case the resulting matrix is +square. A sparse matrix is represented by a Mat, which is +declared on a Sparsity, representing its non-zero structure.

+
+

Sparse Matrix Storage Formats

+

PETSc uses the popular Compressed Sparse Row (CSR) format to only store the +non-zero entries of a sparse matrix. In CSR, a matrix is stored as three +one-dimensional arrays of row pointers, column indices and values, where +the two former are of integer type and the latter of float type, usually +double. As the name suggests, non-zero entries are stored per row, where each +non-zero is defined by a pair of column index and corresponding value. The +column indices and values arrays therefore have a length equal to the total +number of non-zero entries. Row indices are given implicitly by the row +pointer array, which contains the starting index in the column index and +values arrays for the non-zero entries of each row. In other words, the +non-zeros for row i are at positions row_ptr[i] up to but not +including row_ptr[i+1] in the column index and values arrays. For each +row, entries are sorted by column index to allow for faster lookups using a +binary search.

+
+_images/csr.svg
+

A sparse matrix and its corresponding CSR row pointer, column indices and +values arrays

+
+
+

For distributed parallel storage with MPI, the rows of the matrix are +distributed evenly among the processors. Each row is then again divided into a +diagonal and an off-diagonal part, where the diagonal part comprises +columns i to j if i and j are the first and last row owned by +a given processor, and the off-diagonal part all other columns.

+
+_images/mpi_matrix.svg
+

Distribution of a sparse matrix among 3 MPI processes

+
+
+
+
+

Matrix assembly

+

Sparse matrices are assembled by adding up local contributions which are +mapped to global matrix entries via a local-to-global mapping represented by a +pair of Maps for the row and column space.

+
+_images/assembly.svg
+

Assembly of a local tensor A^K into a global matrix A using +the local-to-global mapping \iota_K^1 for rows and \iota_K^2 +for columns

+
+
+

For each par_loop() that assembles a matrix, PyOP2 generates a +call to PETSc’s MatSetValues function for each element of the iteration set, +adding the local contributions computed by the user kernel to the global +matrix using the given Maps. At the end of the +par_loop() PyOP2 automatically calls MatAssemblyBegin and +MatAssemblyEnd to finalise matrix assembly.

+

Consider assembling a Mat on a Sparsity built +from a Map from elements to nodes. The assembly is +done in a par_loop() over elements, where the +Mat A is accessed indirectly via the elem_node +Map using the IterationIndex +i:

+
nodes = op2.Set(NUM_NODES, "nodes")
+elements = op2.Set(NUM_ELE, "elements")
+
+elem_node = op2.Map(elements, nodes, 3, ...)
+
+sparsity = op2.Sparsity((nodes, nodes), (elem_node, elem_node))
+A = op2.Mat(sparsity, np.float64)
+
+b = op2.Dat(nodes, dtype=np.float64)
+
+# Assemble the matrix mat
+op2.par_loop(mat_kernel, elements,
+             A(op2.INC, (elem_node[op2.i[0]], elem_node[op2.i[1]])),
+             ...)
+
+# Assemble the right-hand side vector b
+op2.par_loop(rhs_kernel, elements,
+             b(op2.INC, elem_node[op2.i[0]]),
+             ...)
+
+
+

The code generated for the par_loop() assembling the +Mat for the sequential backend is similar to the following, +where initialisation and staging code described in Sequential backend +have been omitted for brevity. For each element of the iteration +Set a buffer for the local tensor is initialised to zero and +passed to the user kernel performing the local assembly operation. The +addto_vector call subsequently adds this local contribution to the global +sparse matrix.

+
void wrap_mat_kernel__(...) {
+  ...
+  for ( int n = start; n < end; n++ ) {
+    int i = n;
+    ...
+    double buffer_arg0_0[3][3] = {{0}};     // local tensor initialised to 0
+    mat_kernel(buffer_arg0_0, ...);         // local assembly kernel
+    addto_vector(arg0_0_0, buffer_arg0_0,   // Mat objet, local tensor
+                 3, arg0_0_map0_0 + i * 3,  // # rows, global row indices
+                 3, arg0_0_map1_0 + i * 3,  // # cols, global column indices
+                 0);                        // mode: 0 add, 1 insert
+  }
+}
+
+
+
+
+

Building a sparsity pattern

+

The sparsity pattern of a matrix is uniquely defined by the dimensions of the +DataSets forming its row and column space, and one or +more pairs of Maps defining its non-zero structure. This +is exploited in PyOP2 by caching sparsity patterns with these unique +attributes as the cache key to save expensive recomputation. Whenever a +Sparsity is initialised, an already computed pattern with the same +unique key is returned if it exists.

+

For a valid sparsity, each row Map must map to the set of the +row DataSet, each column Map to that of the +column DataSet and the from sets of each pair must match. A +matrix on a sparsity pattern built from more than one pair of maps is +assembled by multiple parallel loops iterating over the corresponding +iteration set for each pair.

+

Sparsity construction proceeds by iterating each Map pair and +building a set of indices of the non-zero columns for each row. Each pair of +entries in the row and column maps gives the row and column index of a +non-zero entry in the matrix and therefore the column index is added to the +set of non-zero entries for that particular row. The array of non-zero entries +per row is then determined as the size of the set for each row and its +exclusive scan yields the row pointer array. The column index array is the +concatenation of all the sets. An algorithm for the sequential case is given +below:

+
for rowmap, colmap in maps:
+    for e in range(rowmap.from_size):
+        for i in range(rowmap.arity):
+            row = rowmap.values[i + e*rowmap.arity]
+            for d in range(colmap.arity):
+                diag[row].insert(colmap.values[d + e * colmap.arity])
+
+
+

For the MPI parallel case a minor modification is required, since for each row +a set of diagonal and off-diagonal column indices needs to be built as +described in Sparse Matrix Storage Formats:

+
for rowmap, colmap in maps:
+    for e in range(rowmap.from_size):
+        for i in range(rowmap.arity):
+            row = rowmap.values[i + e*rowmap.arity]
+            if row < nrows:
+                for d in range(colmap.arity):
+                    if col < ncols:
+                        diag[row].insert(colmap.values[d + e*colmap.arity])
+                    else:
+                        odiag[row].insert(colmap.values[d + e*colmap.arity])
+
+
+
+
+

Solving a linear system

+

PyOP2 provides a Solver, wrapping the PETSc KSP Krylov +solvers which support various iterative methods such as Conjugate Gradients +(CG), Generalized Minimal Residual (GMRES), a stabilized version of +BiConjugate Gradient Squared (BiCGStab) and others. The solvers are +complemented with a range of preconditioners from PETSc’s PC collection, +which includes Jacobi, incomplete Cholesky and LU decompositions and various +multigrid based preconditioners.

+

The choice of solver and preconditioner type and other parameters uses +PETSc’s configuration mechanism documented in the PETSc manual. Options +are passed to the Solver via the keyword argument +parameters taking a dictionary of arguments or directly via keyword +arguments. The solver type is chosen as ksp_type, the preconditioner as +pc_type with the defaults cg and jacobi.

+

Solving a linear system of the matrix A assembled above and the right-hand +side vector b for a solution vector x is done with a call to +solve(), where solver and preconditioner are chosen as +gmres and ilu:

+
x = op2.Dat(nodes, dtype=np.float64)
+
+solver = op2.Solver(ksp_type='gmres', pc_type='ilu')
+solver.solve(A, x, b)
+
+
+
+
+

GPU matrix assembly

+

In a par_loop() assembling a Mat on the GPU, the +local contributions are first computed for all elements of the iteration set +and stored in global memory in a structure-of-arrays (SoA) data layout such +that all threads can write the data out in a coalesced manner. For the example +above, the generated CUDA wrapper code is as follows, again omitting +initialisation and staging code described in CUDA backend. The user +kernel only computes a single element in the local iteration space as detailed +in Local iteration spaces.

+
__global__ void __mat_kernel_stub(...,
+                                  double *arg0,    // local matrix data array
+                                  int arg0_offset, // offset into the array
+                                  ... ) {
+  ... // omitted initialisation and shared memory staging code
+  for ( int idx = threadIdx.x; idx < nelem; idx += blockDim.x ) {
+    ... // omitted staging code
+    for ( int i0 = 0; i0 < 3; ++i0 ) {
+      for ( int i1 = 0; i1 < 3; ++i1 ) {
+        mass_cell_integral_0_otherwise(
+          (double (*)[1])(arg0 + arg0_offset + idx * 9 + i0 * 3 + i1 * 1),
+          ..., i0, i1);
+      }
+    }
+  }
+}
+
+
+

A separate CUDA kernel given below is launched afterwards to compress the data +into a sparse matrix in CSR storage format. Only the values array needs to be +computed, since the row pointer and column indices have already been computed +when building the sparsity on the host and subsequently transferred to GPU +memory. Memory for the local contributions and the values array only needs to +be allocated on the GPU.

+
__global__ void __lma_to_csr(double *lmadata,  // local matrix data array
+                             double *csrdata,  // CSR values array
+                             int *rowptr,      // CSR row pointer array
+                             int *colidx,      // CSR column indices array
+                             int *rowmap,      // row map array
+                             int rowmapdim,    // row map arity
+                             int *colmap,      // column map array
+                             int colmapdim,    // column map arity
+                             int nelems) {
+  int nentries_per_ele = rowmapdim * colmapdim;
+  int n = threadIdx.x + blockIdx.x * blockDim.x;
+  if ( n >= nelems * nentries_per_ele ) return;
+
+  int e = n / nentries_per_ele;                        // set element
+  int i = (n - e * nentries_per_ele) / rowmapdim;      // local row
+  int j = (n - e * nentries_per_ele - i * colmapdim);  // local column
+
+  // Compute position in values array
+  int offset = pos(rowmap[e * rowmapdim + i], colmap[e * colmapdim + j],
+                   rowptr, colidx);
+  __atomic_add(csrdata + offset, lmadata[n]);
+}
+
+
+
+
+

GPU linear algebra

+

Linear algebra on the GPU with the cuda backend uses the Cusp library, +which does not support all solvers and preconditioners provided by PETSc. The +interface to the user is the same as for the sequential and openmp +backends. Supported solver types are CG (cg), GMRES (gmres) and +BiCGStab (bicgstab), with preconditioners of types Jacobi (jacobi), +Bridson approximate inverse (ainv) and algebraic multigrid (amg). An +exception is raised if an unsupported solver or preconditioner type is +requested. A Cusp solver with the chosen parameters is automatically +generated when solve() is called.

+
+

Note

+

Distributed parallel linear algebra operations with MPI are currently not +supported by the cuda backend.

+
+
+
+ + +
+
+
+
+ +
+
+ + + + \ No newline at end of file diff --git a/mixed.html b/mixed.html new file mode 100644 index 000000000..3315218d1 --- /dev/null +++ b/mixed.html @@ -0,0 +1,250 @@ + + + + + + + + Mixed Types — PyOP2 2020.0 documentation + + + + + + + + + + + + + + +
+
+
+
+ +
+

Mixed Types

+

When solving linear systems of equations as they arise for instance in the +finite-element method (FEM), one is often interested in coupled solutions of +more than one quantity. In fluid dynamics, a common example is solving a +coupled system of velocity and pressure as it occurs in some formulations of +the Navier-Stokes equations.

+
+

Mixed Set, DataSet, Map and Dat

+

PyOP2 provides the mixed types MixedSet, +MixedDataSet, MixedMap and +MixedDat for a Set, DataSet, +Map and Dat respectively. A mixed type is +constructed from a list or other iterable of its base type and provides the +same attributes and methods. Under most circumstances types and mixed types +behave the same way and can be treated uniformly. Mixed types allow iteration +over their constituent parts and for convenience the base types are also +iterable, yielding themselves.

+

A MixedSet is defined from a list of sets:

+
s1, s2 = op2.Set(N), op2.Set(M)
+ms = op2.MixedSet([s1, s2])
+
+
+

There are a number of equivalent ways of defining a +MixedDataSet:

+
mds = op2.MixedDataSet([s1, s2], (1, 2))
+mds = op2.MixedDataSet([s1**1, s2**2])
+mds = op2.MixedDataSet(ms, (1, 2))
+mds = ms**(1, 2)
+
+
+

A MixedDat with no associated data is defined in one of the +following ways:

+
md = op2.MixedDat(mds)
+md = op2.MixedDat([s1**1, s2**2])
+md = op2.MixedDat([op2.Dat(s1**1), op2.Dat(s2**2)])
+
+
+

Finally, a MixedMap is defined from a list of maps, all of +which must share the same source Set:

+
it = op2.Set(S)
+mm = op2.MixedMap([op2.Map(it, s1, 2), op2.Map(it, s2, 3)])
+
+
+
+
+

Block Sparsity and Mat

+

When declaring a Sparsity on pairs of mixed maps, the +resulting sparsity pattern has a square block structure with as many block +rows and columns as there are components in the MixedDataSet +forming its row and column space. In the most general case a +Sparsity is constructed as follows:

+
it = op2.Set(...)  # Iteration set
+sr0, sr1 = op2.Set(...), op2.Set(...)  # Sets for row spaces
+sc0, sc1 = op2.Set(...), op2.Set(...)  # Sets for column spaces
+# MixedMaps for the row and column spaces
+mr = op2.MixedMap([op2.Map(it, sr0, ...), op2.Map(it, sr1, ...)])
+mc = op2.MixedMap([op2.Map(it, sc0, ...), op2.Map(it, sc1, ...)])
+# MixedDataSets for the row and column spaces
+dsr = op2.MixedDataSet([sr0**1, sr1**1])
+dsc = op2.MixedDataSet([sc0**1, sc1**1])
+# Blocked sparsity
+sparsity = op2.Sparsity((dsr, dsc), [(mr, mc), ...])
+
+
+

The relationships of each component of the mixed maps and datasets to the +blocks of the Sparsity is shown in the following diagram:

+
+_images/mixed_sparsity.svg
+

The contribution of sets, maps and datasets to the blocked sparsity.

+
+
+

Block sparsity patterns are computed separately for each block as described in +Building a sparsity pattern and the same validity rules apply. A +Mat defined on a block Sparsity has the same +block structure, which is implemented using a PETSc MATNEST.

+
+
+

Mixed Assembly

+

When assembling into a MixedDat or a block +Mat, the Kernel produces a local tensor of the +same block structure, which is a combination of Local iteration spaces +of all its subblocks. This is entirely transparent to the kernel however, +which sees the combined local iteration space. PyOP2 ensures that indirectly +accessed data is gathered and scattered via the correct maps and packed +together into a contiguous vector to be passed to the kernel. Contributions +from the local tensor are assembled into the correct blocks of the +MixedDat or Mat.

+

Consider the following example par_loop() assembling a block +Mat:

+
it, cells, nodes = op2.Set(...), op2.Set(...), op2.Set(...)
+mds = op2.MixedDataSet([nodes, cells])
+mmap = op2.MixedMap([op2.Map(it, nodes, 2, ...), op2.Map(it, cells, 1, ...)])
+mat = op2.Mat(op2.Sparsity(mds, mmap))
+d = op2.MixedDat(mds)
+
+op2.par_loop(kernel, it,
+             mat(op2.INC, (mmap[op2.i[0]], mmap[op2.i[1]])),
+             d(op2.read, mmap))
+
+
+

The kernel for this par_loop() assembles a 3x3 local tensor +and is passed an input vector of length 3 for each iteration set element:

+
void kernel(double v[3][3] , double **d ) {
+  for (int i = 0; i<3; i++)
+    for (int j = 0; j<3; j++)
+      v[i][j] += d[i][0] * d[j][0];
+}
+
+
+

The top-left 2x2 block of the local tensor is assembled into the (0,0) block +of the matrix, the top-right 2x1 block into (0,1), the bottom-left 1x2 block +into (1,0) and finally the bottom-right 1x1 block into (1,1). Note that for +the (0,0) block only the first component of the MixedDat is +read and for the (1,1) block only the second component. For the (0,1) and +(1,0) blocks, both components of the MixedDat are accessed.

+

This diagram illustrates the assembly of the block Mat:

+
+_images/mixed_assembly.svg
+

Assembling into the blocks of a global matrix A: block +A^{0,0} uses maps \iota^{1,0} and \iota^{2,0}, +A^{0,1} uses \iota^{1,0} and \iota^{2,1}, +A^{1,0} uses \iota^{1,1} and \iota^{2,0} and finally +A^{1,1} uses \iota^{1,1} and \iota^{2,1} for the row +and column spaces respectively.

+
+
+
+
+ + +
+
+
+
+ +
+
+ + + + \ No newline at end of file diff --git a/mpi.html b/mpi.html new file mode 100644 index 000000000..df80ad773 --- /dev/null +++ b/mpi.html @@ -0,0 +1,234 @@ + + + + + + + + MPI — PyOP2 2020.0 documentation + + + + + + + + + + + + + + +
+
+
+
+ +
+

MPI

+

Distributed parallel computations with MPI in PyOP2 require the mesh to be +partitioned among the processors. To be able to compute over entities on their +boundaries, partitions need to access data owned by neighboring processors. +This region, called the halo, needs to be kept up to date and is therefore +exchanged between the processors as required.

+
+

Local Numbering

+

The partition of each Set local to each process consists of +entities owned by the process and the halo, which are entities owned by +other processes but required to compute on the boundary of the owned entities. +Each of these sections is again divided into two sections required to +efficiently overlap communication and computation and avoid communication +during matrix assembly as described below. Each locally stored +Set entity therefore belongs to one of four categories:

+
    +
  • Core: Entities owned by this processor which can be processed without +accessing halo data.

  • +
  • Owned: Entities owned by this processor which access halo data when +processed.

  • +
  • Exec halo: Off-processor entities which are redundantly executed over +because they touch owned entities.

  • +
  • Non-exec halo: Off-processor entities which are not processed, but read +when computing the exec halo.

  • +
+

The following diagram illustrates the four sections for a mesh distributed +among two processors:

+
+_images/pyop2_mpi_mesh.svg
+

A mesh distributed among two processors with the entities of each mesh +partition divided into core, owned, exec halo and non-exec halo. +Matching halo sections are highlighted in matching colours. The owned +section of process 0 corresponds to the non-exec section of process 1.

+
+
+

For data defined on the Set to be stored contiguously per +section, local Set entities must be numbered such that core +entities are first, followed by owned, exec halo and non-exec halo in that +order. A good partitioning maximises the size of the core section and +minimises the halo regions. We can therefore assume that the vast majority of +local Set entities are in the core section.

+
+
+

Computation-communication Overlap

+

The ordering of Set entities into four sections allows for a +very efficient overlap of computation and communication. Core entities that do +not access any halo data can be processed entirely without access to halo data +immediately after the halo exchange has been initiated. Execution over the +owned and exec halo regions requires up to date halo data and can only start +once the halo exchange is completed. Depending on the latency and bandwidth +of communication and the size of the core section relative to the halo, the +halo exchange may complete before the computation on the core section.

+

The entire process is given below:

+
halo_exchange_begin()                      # Initiate halo exchange
+maybe_set_dat_dirty()                      # Mark Dats as modified
+compute_if_not_empty(itset.core_part)      # Compute core region
+halo_exchange_end()                        # Wait for halo exchange
+compute_if_not_empty(itset.owned_part)     # Compute owned region
+reduction_begin()                          # Initiate reductions
+if needs_exec_halo:                        # Any indirect Dat not READ?
+    compute_if_not_empty(itset.exec_part)  # Compute exec halo region
+reduction_end()                            # Wait for reductions
+maybe_set_halo_update_needed()             # Mark halos as out of date
+assemble()                                 # Finalise matrix assembly
+
+
+

Any reductions depend on data from the core and owned sections and are +initiated as soon as the owned section has been processed and execute +concurrently with computation on the exec halo. Similar to +halo_exchange_begin and halo_exchange_end, reduction_begin and +reduction_end do no work at all if none of the par_loop() +arguments requires a reduction. If the par_loop() assembles a +Mat, the matrix assembly is finalised at the end.

+

By dividing entities into sections according to their relation to the halo, +there is no need to check whether or not a given entity touches the halo +during computations on each section. This avoids branching in kernels or +wrapper code and allows launching separate kernels for GPU execution of each +section. The par_loop() execution therefore has the above +structure for all backends.

+
+
+

Halo exchange

+

Exchanging halo data is only required if the halo data is actually read, which +is the case for Dat arguments to a par_loop() +used in pyop2.READ or pyop2.RW mode. PyOP2 keeps track of +whether or not the halo region may have been modified. This is the case for +Dats used in pyop2.INC, pyop2.WRITE or +pyop2.RW mode or when a Solver or a user requests +access to the data. A halo exchange is triggered only for halos marked as out +of date.

+
+
+

Distributed Assembly

+

For an MPI distributed matrix or vector, assembling owned entities at the +boundary can contribute to off-process degrees of freedom and vice versa.

+

There are different ways of accounting for these off-process contributions. +PETSc supports insertion and subsequent communication of off-process matrix +and vector entries, however its implementation is not thread safe. Concurrent +insertion into PETSc MPI matrices is thread safe if off-process insertions +are not cached and concurrent writes to rows are avoided, which is done +through colouring as described in Colouring.

+

PyOP2 therefore disables PETSc’s off-process insertion feature and instead +redundantly computes over all off-process entities that touch local dofs, +which is the exec halo section described above. The price for this is +maintaining a larger halo, since we also need halo data, the non-exec halo +section, to perform the redundant computation. Halos grow by about a factor +of two, however in practice this is still small compared to the interior region +of a partition and the main cost of halo exchange is the latency, which is +independent of the exchanged data volume.

+
+
+ + +
+
+
+
+ +
+
+ + + + \ No newline at end of file diff --git a/objects.inv b/objects.inv new file mode 100644 index 0000000000000000000000000000000000000000..a5d9a2f74f8d97f99f3f6437377d9d9c1fab0d58 GIT binary patch literal 3018 zcmV;*3pMm3AX9K?X>NERX>N99Zgg*Qc_4OWa&u{KZXhxWBOp+6Z)#;@bUGkVc~4L> z3L_v^WpZ%ZEX>4U6X>%ZB zZ*6dLWpi_7WFU2OX>MmAdTeQ8E(&RnQOTPRk{IBmnsjB?Dy|((tafdH?xQ zLzq&X*!GHJKsn2wcvgsxOzOD5%T4oe_0$wNkmQPn=xfP#Jd3JQ!4y-6mdn|l8x-r0 zE&JZ{&+z}%UL8bCX1P?R;dgzV8=sgZ3bl-q)qT|-vO>8^&es}AR*FifS*7xqoVVIw zeoGFL^-U+#LAT^x*EObb!p-Xif*Vw=D2)>!^&>Yz5#BO|u03ekBd6&};y+p0Bh{ip zmwaCr{Dr2*SNvXa=xvS%{EK%rFVWS1U@b+9ulEDNQ6HLu&|)80#nDAm;sdn!T0e@P zgj%<(lQ6KoYr2Xl5&~a`ehU+;373z(`nY@3^mQR;QOah&=iTh#+?wk@bLIl@%yqu$h$zk?GfbF{AdB=A%$!k%^S;NRvvogz@s+uj5nfVYUQoL%j zSH)e-QlK+-q7?Ohc5?;AT8WAmS*Y0TqMWefdndd0Y{yEe*nU=S<8q=rckXudfcEz3 z#~e%7H-RvH!V%!2a((~d_H^DooA7Y+Zz|#b1Cj9Q?&d8O@s$Yp`sx1j9hTv`L8rVC zD%bR}D%;$VD&NGhvabkHXv(=dutP-==+-w|SOT$Jy|s+ladf0z=Y^=EyZ;rWX%8nB ziV+0UWI%wSspXwIU>RB0B*J7rgnG)JSlP%V#MHPZ5hiiP+!h4Mol>}DSg(^PyN>gp z+_VnOzjs~plh+AA`?A?WGNKwt;Bxq93$uQm7Rl>^1{8Jel3p~TY*;}hg_?<9VwER*04CD#;#<~oc1vCi6w|Q*_MJng7kH=$o(ZWi4hs!f za=a*1p??bg15>cp;d22Gz|P`hJ#q#=LJ()Q2`we6Nq|UIA!29i3S`f$6a}Rg_)+6Q zyzl}OXI_X*TZ0@vWY4^lZq*&z0%Hy%1CU{J%pm&K6ql6Mq+q115;oHvAm|h|AV{-Z zpd`vE&+<@tQlQbsnSe2yj1{j{b`-bBY9W){pih*ap5@URosgLZ20oPw2x1-=IHa^H zP;jZ>(b6wws$yBrx=x&YgD`8GC;-e`%G&{ks>K+BP8QG5uw&tAT++G~Y3r6|AUe&i zSw*brUDc|CMuOLi8t*E9^4#t)C*=@xuCrJe9}?rx!_7q~D#^P@MCDaJ_dyLwp!Hwa zh<1yS7C!)uAxRi?#7^+C$b`gAxf4{$4o$CvB&VbH6>I4flb9rVHuUf|hDfLm>#{Tt zhg<|q7=68kHu}a3Vx4|Bi614jCIni5O;HI+sXz(hrupcEL~+q62)l%RjiP#}Ue=>>E-lpG&y|YR zaXvj!u8STTBWEh|HIaeXV2v-_>d@P569&2Z_}#WC1odFQ^U9@7Obca3%z(3xMEHIf$(mNDCDj z$nbj9^3{u*-HuYWm1p1k270l^b8zcjuT~A^83<9BF8cI}9TkF3NQ%ROK?ue%aW-WTgU!PT5$6$vu=5B?>`fWO0Nn7d0`uq?#J-T!+jMs?s_P40 z#BTW>Y4z4B_;~bVpT=a6)3XB^*0K)cqQeF|rcr_ta{P{GsBQi!2jx#w1ZwPkBI^p?Q;*nmLIa1#X4OEC_b%r}Eu>d}KnAl0D|yB&wgAdh?j;`Qj5&|*;+B9GrS z8{{T?48MzyuRS3iNuj94%@6HBr0%EBJfz`(HWC>1p@L59M4nOokUkcy`x@&<8{ZG{ 
ze(Tbhm;Y24ewqTI7rqMUzTn6X~7 zQtW3P6ynTfruw$zaf+G8rCEVFO%KLMKd41!Z_ONVkxstg*xNn|JA0WiR zWA`#r6Hx{v8ls(V4%StKwf^qI?l50qUcNEKvn`W+W^YV1Yv|AXTWGV(CkP(t!)|a) zp++>68eOQUV@nh@p}4S17*nDV5emh$E3#R62mY3IJiOmL?|1g}lvTlH4k4f#NYkV$ zek<9Qm&?^JOUT0YDO+LWbB70w>=&%dpM-*mNgr6L#YK~A>G-;lFE@ven_rhVqwBB5 zq2yO`DE$Or%kpo$E>LzaqI3wNEqcrFm%gE0%g1wxo(>jGww=Pty_0(DmdHw$D+PqnS0(# zjiBvT-tP9x+Z%s>VZN?%u}J-j-I%THmnWyz-^?+F6XT_|TiD-T=tbnE0u0$&neWF~c~)TG>t5A3GrI$1mDP-*w9SJqia$fXmCf-xYq|=iZ_VdmwAk}n=ri@E z41d+Sh&gm`v2XhfMl4=!%fr;Vgta;AGFI;Swqv$juwDi0-#1^DQ1?!}Ed4culTG=| z=drA*23d=xvsgSeQq5`!Z8)^UAMj~tg`o*tjEtWCD=+f~7M`lHZvVgk{1?W(ChzsH z$IMBqe*?XQ^^iTeb&46T`?fS!bdcHzg%oHk=&af2C+>pmUZ?vNFNFTm;l;^K#CtXD!3L_Ypqcz7I2>zFW0(IaNF1JZu7|)xV~?=0afP{A|NI7>2;c zd|b38n+#m~vx`x0c=y8d-njh+ZVrl}t-VH%3Kvu`&6Z8yy~<_A`de>e6=CP?qG&U+ zvbv5B->^C>vr+wNWXg0lsMy|c!dar@<_Twa+BWhZ7bAbW;)t6AB=GgrA5Bg^DJLZ9 z_TME8INucLVD@ao*nyKWdBvMvl!aycdrNn3(d$i*QK#z9V=prI MCn?P0|9&3z2-5!ePyhe` literal 0 HcmV?d00001 diff --git a/plan.html b/plan.html new file mode 100644 index 000000000..e7cd3fb4b --- /dev/null +++ b/plan.html @@ -0,0 +1,187 @@ + + + + + + + + Parallel Execution Plan — PyOP2 2020.0 documentation + + + + + + + + + + + + + + +
+
+
+
+ +
+

Parallel Execution Plan

+

For all PyOP2 backends with the exception of sequential, a parallel execution +plan is computed for each par_loop(). It contains information +guiding the code generator on how to partition, stage and colour the data for +efficient parallel processing.

+
+

Partitioning

+

The iteration set is split into a number of equally sized and contiguous +mini-partitions such that the working set of each mini-partition fits into +shared memory or last level cache. This is unrelated to the partitioning +required for MPI as described in MPI.

+
+
+

Local Renumbering and Staging

+

While a mini-partition is a contiguous chunk of the iteration set, the +indirectly accessed data it references is not necessarily contiguous. For each +mini-partition and unique Dat-Map pair, a +mapping from local indices within the partition to global indices is +constructed as the sorted array of unique Map indices accessed +by this partition. At the same time, a global-to-local mapping is constructed +as its inverse.

+

Data for indirectly accessed Dat arguments is staged in shared +device memory as described in PyOP2 Backends. For each partition, the +local-to-global mapping indicates where data to be staged in is read from and +the global-to-local mapping gives the location in shared memory data has been +staged at. The amount of shared memory required is computed from the size of +the local-to-global mapping.

+
+
+

Colouring

+

A two-level colouring is used to avoid race conditions. Partitions are +coloured such that partitions of the same colour can be executed concurrently +and threads executing on a partition in parallel are coloured such that no two +threads indirectly reference the same data. Only par_loop() +arguments performing an indirect reduction or assembling a matrix require +colouring. Matrices are coloured per row.

+

For each element of a Set indirectly accessed in a +par_loop(), a bit vector is used to record which colours +indirectly reference it. To colour each thread within a partition, the +algorithm proceeds as follows:

+
    +
  1. Loop over all indirectly accessed arguments and collect the colours of all +Set elements referenced by the current thread in a bit mask.

  2. +
  3. Choose the next available colour as the colour of the current thread.

  4. +
  5. Loop over all Set elements indirectly accessed by the +current thread again and set the new colour in their colour mask.

  6. +
+

Since the bit mask is a 32-bit integer, up to 32 colours can be processed in a +single pass, which is sufficient for most applications. If not all threads can +be coloured with 32 distinct colours, the mask is reset and another pass is +made, where each newly allocated colour is offset by 32. Should another pass +be required, the offset is increased to 64 and so on until all threads are +coloured.

+
+_images/pyop2_colouring.svg
+

Thread colouring within a mini-partition for a Dat on +vertices indirectly accessed in a computation over the edges. The edges are +coloured such that no two edges touch the same vertex within the partition.

+
+
+

The colouring of mini-partitions is done in the same way, except that all +Set elements indirectly accessed by the entire partition are +referenced, not only those accessed by a single thread.

+
+
+ + +
+
+
+
+ +
+
+ + + + \ No newline at end of file diff --git a/profiling.html b/profiling.html new file mode 100644 index 000000000..44c2f4767 --- /dev/null +++ b/profiling.html @@ -0,0 +1,287 @@ + + + + + + + + Profiling — PyOP2 2020.0 documentation + + + + + + + + + + + + + + +
+
+
+
+ +
+

Profiling

+
+

Profiling PyOP2 programs

+

Profiling a PyOP2 program is as simple as profiling any other Python +code. You can profile the jacobi demo in the PyOP2 demo folder as +follows:

+
python -m cProfile -o jacobi.dat jacobi.py
+
+
+

This will run the entire program under cProfile and write the profiling +data to jacobi.dat. Omitting -o will print a summary to stdout, +which is not very helpful in most cases.

+
+

Creating a graph

+

There is a much more intuitive way of representing the profiling data +using the excellent gprof2dot to generate a graph. Install from PyPI with

+
sudo pip install gprof2dot
+
+
+

Use as follows to create a PDF:

+
gprof2dot -f pstats -n 1 jacobi.dat | dot -Tpdf -o jacobi.pdf
+
+
+

-f pstats tells gprof2dot that it is dealing with Python +cProfile data (and not actual gprof data) and -n 1 ignores +everything that makes up less than 1% of the total runtime - most likely +you are not interested in that (the default is 0.5).

+
+
+

Consolidating profiles from different runs

+

To aggregate profiling data from different runs, save the following as +concat.py:

+
"""Usage: concat.py PATTERN FILE"""
+
+import sys
+from glob import glob
+from pstats import Stats
+
+if len(sys.argv) != 3:
+    print __doc__
+    sys.exit(1)
+files = glob(sys.argv[1])
+s = Stats(files[0])
+for f in files[1:]: s.add(f)
+s.dump_stats(sys.argv[2])
+
+
+

With profiles from different runs named <basename>.*.part, use it +as

+
python concat.py '<basename>.*.part' <basename>.dat
+
+
+

and then call gprof2dot as before.

+
+
+
+

Using PyOP2’s internal timers

+

PyOP2 automatically times the execution of certain regions:

+
    +
  • Sparsity building

  • +
  • Plan construction

  • +
  • Parallel loop kernel execution

  • +
  • Halo exchange

  • +
  • Reductions

  • +
  • PETSc Krylov solver

  • +
+

To output those timings, call summary() in your +PyOP2 program or run with the environment variable +PYOP2_PRINT_SUMMARY set to 1.

+

To query e.g. the timer for parallel loop execution programmatically, +use the timing() helper:

+
from pyop2 import timing
+timing("ParLoop compute")               # get total time
+timing("ParLoop compute", total=False)  # get average time per call
+
+
+

To add additional timers to your own code, you can use the +timed_region() and +timed_function() helpers:

+
from pyop2.profiling import timed_region, timed_function
+
+with timed_region("my code"):
+    # my code
+
+@timed_function("my function")
+def my_func():
+    # my func
+
+
+
+
+

Line-by-line profiling

+

To get a line-by-line profile of a given function, install Robert Kern’s +line profiler and:

+
    +
  1. Import the profile() decorator:

    +
    from pyop2.profiling import profile
    +
    +
    +
  2. +
  3. Decorate the function to profile with @profile

  4. +
  5. Run your script with kernprof.py -l <script.py>

  6. +
  7. Generate an annotated source file with

    +
    python -m line_profiler <script.py.lprof>
    +
    +
    +
  8. +
+

Note that kernprof.py injects the @profile decorator into the +Python builtins namespace. PyOP2 provides a passthrough version of this +decorator which does nothing if profile is not found in +__builtins__. This means you can run your script regularly without +having to remove the decorators again.

+

The profile() decorator also works with the +memory profiler (see below). PyOP2 therefore provides the +lineprof() decorator which is only enabled when +running with kernprof.py.

+

A number of PyOP2 internal functions are decorated such that running +your PyOP2 application with kernprof.py will produce a line-by-line +profile of the parallel loop computation (but not the generated code!).

+
+
+

Memory profiling

+

To profile the memory usage of your application, install Fabian +Pedregosa’s memory profiler and:

+
    +
  1. Import the profile() decorator:

    +
    from pyop2.profiling import profile
    +
    +
    +
  2. +
  3. Decorate the function to profile with @profile.

  4. +
  5. Run your script with

    +
    python -m memory_profiler <script.py>
    +
    +
    +

    to get a line-by-line memory profile of your function.

    +
  6. +
  7. Run your script with

    +
    memprof run --python <script.py>
    +
    +
    +

    to record memory usage of your program over time.

    +
  8. +
  9. Generate a plot of the memory profile with memprof plot.

  10. +
+

Note that memprof and python -m memory_profiler inject the +@profile decorator into the Python builtins namespace. PyOP2 +provides a passthrough version of this decorator which does nothing if +profile is not found in __builtins__. This means you can run +your script regularly without having to remove the decorators again.

+

The profile() decorator also works with the line +profiler (see above). PyOP2 therefore provides the +memprof() decorator which is only enabled when +running with memprof.

+

A number of PyOP2 internal functions are decorated such that running +your PyOP2 application with memprof run will produce a memory +profile of the parallel loop computation (but not the generated code!).

+
+
+ + +
+
+
+
+ +
+
+ + + + \ No newline at end of file diff --git a/py-modindex.html b/py-modindex.html new file mode 100644 index 000000000..1316dbd48 --- /dev/null +++ b/py-modindex.html @@ -0,0 +1,173 @@ + + + + + + + Python Module Index — PyOP2 2020.0 documentation + + + + + + + + + + + + + + + +
+
+
+
+ + +

Python Module Index

+ +
+ p +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
 
+ p
+ pyop2 +
    + pyop2.configuration +
    + pyop2.datatypes +
    + pyop2.exceptions +
    + pyop2.logger +
    + pyop2.mpi +
    + pyop2.profiling +
    + pyop2.types.access +
    + pyop2.types.dat +
    + pyop2.types.data_carrier +
    + pyop2.types.dataset +
    + pyop2.types.glob +
    + pyop2.types.halo +
    + pyop2.types.map +
    + pyop2.types.set +
    + pyop2.utils +
+ + +
+
+
+
+ +
+
+ + + + \ No newline at end of file diff --git a/pyop2.codegen.html b/pyop2.codegen.html new file mode 100644 index 000000000..bcf8dd49b --- /dev/null +++ b/pyop2.codegen.html @@ -0,0 +1,155 @@ + + + + + + + + pyop2.codegen package — PyOP2 2020.0 documentation + + + + + + + + + + + + + + +
+
+
+
+ +
+

pyop2.codegen package

+
+

Submodules

+
+
+

pyop2.codegen.builder module

+
+
+

pyop2.codegen.loopycompat module

+
+
+

pyop2.codegen.node module

+
+
+

pyop2.codegen.optimise module

+
+
+

pyop2.codegen.rep2loopy module

+
+
+

pyop2.codegen.representation module

+
+
+

Module contents

+
+
+ + +
+
+
+
+ +
+
+ + + + \ No newline at end of file diff --git a/pyop2.html b/pyop2.html new file mode 100644 index 000000000..119e9c0f7 --- /dev/null +++ b/pyop2.html @@ -0,0 +1,1172 @@ + + + + + + + + pyop2 package — PyOP2 2020.0 documentation + + + + + + + + + + + + + + +
+
+
+
+ +
+

pyop2 package

+
+

Subpackages

+
+ +
+
+
+

Submodules

+
+
+

pyop2.caching module

+
+
+

pyop2.compilation module

+
+
+

pyop2.configuration module

+

PyOP2 global configuration.

+
+
+class pyop2.configuration.Configuration
+

Bases: dict

+

PyOP2 configuration parameters

+
+
Parameters:
+
    +
  • cc – C compiler (executable name eg: gcc +or path eg: /opt/gcc/bin/gcc).

  • +
  • cxx – C++ compiler (executable name eg: g++ +or path eg: /opt/gcc/bin/g++).

  • +
  • ld – Linker (executable name ld +or path eg: /opt/gcc/bin/ld).

  • +
  • cflags – extra flags to be passed to the C compiler.

  • +
  • cxxflags – extra flags to be passed to the C++ compiler.

  • +
  • ldflags – extra flags to be passed to the linker.

  • +
  • simd_width – number of doubles in SIMD instructions +(e.g. 4 for AVX2, 8 for AVX512).

  • +
  • debug – Turn on debugging for generated code (turns off +compiler optimisations).

  • +
  • type_check – Should PyOP2 type-check API-calls? (Default, +yes)

  • +
  • check_src_hashes – Should PyOP2 check that generated code is +the same on all processes? (Default, yes). Uses an allreduce.

  • +
  • cache_dir – Where should generated code be cached?

  • +
  • node_local_compilation

    Should generated code be compiled +“node-local” (one process for each set of processes that share

    +
    +

    a filesystem)? You should probably arrange to set cache_dir +to a node-local filesystem too.

    +
    +

  • +
  • log_level – How chatty should PyOP2 be? Valid values +are “DEBUG”, “INFO”, “WARNING”, “ERROR”, “CRITICAL”.

  • +
  • print_cache_size – Should PyOP2 print the size of caches at +program exit?

  • +
  • matnest – Should matrices on mixed maps be built as nests? (Default yes)

  • +
  • block_sparsity – Should sparsity patterns on datasets with +cdim > 1 be built as block sparsities, or dof sparsities? The +former saves memory but changes which preconditioners are +available for the resulting matrices. (Default yes)

  • +
+
+
+
+
+cache_dir = '/tmp/pyop2-cache-uid1001'
+
+ +
+
+DEFAULTS = {'block_sparsity': ('PYOP2_BLOCK_SPARSITY', <class 'bool'>, True), 'cache_dir': ('PYOP2_CACHE_DIR', <class 'str'>, '/tmp/pyop2-cache-uid1001'), 'cc': ('PYOP2_CC', <class 'str'>, ''), 'cflags': ('PYOP2_CFLAGS', <class 'str'>, ''), 'check_src_hashes': ('PYOP2_CHECK_SRC_HASHES', <class 'bool'>, True), 'compute_kernel_flops': ('PYOP2_COMPUTE_KERNEL_FLOPS', <class 'bool'>, False), 'cxx': ('PYOP2_CXX', <class 'str'>, ''), 'cxxflags': ('PYOP2_CXXFLAGS', <class 'str'>, ''), 'debug': ('PYOP2_DEBUG', <class 'bool'>, False), 'ld': ('PYOP2_LD', <class 'str'>, ''), 'ldflags': ('PYOP2_LDFLAGS', <class 'str'>, ''), 'log_level': ('PYOP2_LOG_LEVEL', (<class 'str'>, <class 'int'>), 'WARNING'), 'matnest': ('PYOP2_MATNEST', <class 'bool'>, True), 'no_fork_available': ('PYOP2_NO_FORK_AVAILABLE', <class 'bool'>, False), 'node_local_compilation': ('PYOP2_NODE_LOCAL_COMPILATION', <class 'bool'>, True), 'print_cache_size': ('PYOP2_PRINT_CACHE_SIZE', <class 'bool'>, False), 'simd_width': ('PYOP2_SIMD_WIDTH', <class 'int'>, 4), 'type_check': ('PYOP2_TYPE_CHECK', <class 'bool'>, True)}
+

Default values for PyOP2 configuration parameters

+
+ +
+
+reset()
+

Reset the configuration parameters to the default values.

+
+ +
+
+reconfigure(**kwargs)
+

Update the configuration parameters with new values.

+
+ +
+
+unsafe_reconfigure(**kwargs)
+

Unsafely reconfigure (just replacing the values)

+
+ +
+ +
+
+

pyop2.datatypes module

+
+
+pyop2.datatypes.as_cstr(dtype)
+

Convert a numpy dtype like object to a C type as a string.

+
+ +
+
+pyop2.datatypes.as_ctypes(dtype)
+

Convert a numpy dtype like object to a ctypes type.

+
+ +
+
+pyop2.datatypes.as_numpy_dtype(dtype)
+

Convert a dtype-like object into a numpy dtype.

+
+ +
+
+pyop2.datatypes.dtype_limits(dtype)
+

Attempt to determine the min and max values of a datatype.

+
+
Parameters:
+

dtype – A numpy datatype.

+
+
Returns:
+

a 2-tuple of min, max

+
+
Raises:
+

ValueError – If numeric limits could not be determined.

+
+
+
+ +
+
+class pyop2.datatypes.OpaqueType(name)
+

Bases: OpaqueType

+
+ +
+
+

pyop2.exceptions module

+

OP2 exception types

+
+
+exception pyop2.exceptions.DataTypeError
+

Bases: TypeError

+

Invalid type for data.

+
+ +
+
+exception pyop2.exceptions.DimTypeError
+

Bases: TypeError

+

Invalid type for dimension.

+
+ +
+
+exception pyop2.exceptions.ArityTypeError
+

Bases: TypeError

+

Invalid type for arity.

+
+ +
+
+exception pyop2.exceptions.IndexTypeError
+

Bases: TypeError

+

Invalid type for index.

+
+ +
+
+exception pyop2.exceptions.NameTypeError
+

Bases: TypeError

+

Invalid type for name.

+
+ +
+
+exception pyop2.exceptions.SetTypeError
+

Bases: TypeError

+

Invalid type for pyop2.op2.Set.

+
+ +
+
+exception pyop2.exceptions.SizeTypeError
+

Bases: TypeError

+

Invalid type for size.

+
+ +
+
+exception pyop2.exceptions.SubsetIndexOutOfBounds
+

Bases: TypeError

+

Out of bound index.

+
+ +
+
+exception pyop2.exceptions.SparsityTypeError
+

Bases: TypeError

+

Invalid type for pyop2.op2.Sparsity.

+
+ +
+
+exception pyop2.exceptions.MapTypeError
+

Bases: TypeError

+

Invalid type for pyop2.op2.Map.

+
+ +
+
+exception pyop2.exceptions.DataSetTypeError
+

Bases: TypeError

+

Invalid type for pyop2.op2.DataSet.

+
+ +
+
+exception pyop2.exceptions.MatTypeError
+

Bases: TypeError

+

Invalid type for pyop2.op2.Mat.

+
+ +
+
+exception pyop2.exceptions.DatTypeError
+

Bases: TypeError

+

Invalid type for pyop2.op2.Dat.

+
+ +
+
+exception pyop2.exceptions.KernelTypeError
+

Bases: TypeError

+

Invalid type for pyop2.op2.Kernel.

+
+ +
+
+exception pyop2.exceptions.DataValueError
+

Bases: ValueError

+

Illegal value for data.

+
+ +
+
+exception pyop2.exceptions.IndexValueError
+

Bases: ValueError

+

Illegal value for index.

+
+ +
+
+exception pyop2.exceptions.ModeValueError
+

Bases: ValueError

+

Illegal value for mode.

+
+ +
+
+exception pyop2.exceptions.IterateValueError
+

Bases: ValueError

+

Illegal value for iterate.

+
+ +
+
+exception pyop2.exceptions.SetValueError
+

Bases: ValueError

+

Illegal value for pyop2.op2.Set.

+
+ +
+
+exception pyop2.exceptions.MapValueError
+

Bases: ValueError

+

Illegal value for pyop2.op2.Map.

+
+ +
+
+exception pyop2.exceptions.ConfigurationError
+

Bases: RuntimeError

+

Illegal configuration value or type.

+
+ +
+
+exception pyop2.exceptions.CompilationError
+

Bases: RuntimeError

+

Error during JIT compilation

+
+ +
+
+exception pyop2.exceptions.SparsityFormatError
+

Bases: ValueError

+

Unable to produce a sparsity for this matrix format.

+
+ +
+
+

pyop2.global_kernel module

+
+
+

pyop2.local_kernel module

+
+
+

pyop2.logger module

+

The PyOP2 logger, based on the Python standard library logging module.

+
+
+pyop2.logger.set_log_level(level)
+

Set the log level of the PyOP2 logger.

+
+
Parameters:
+

level – the log level. Valid values: DEBUG, INFO, WARNING, ERROR, CRITICAL

+
+
+
+ +
+
+pyop2.logger.log(level, msg, *args, **kwargs)
+

Print ‘msg % args’ with the severity ‘level’.

+
+
Parameters:
+
    +
  • level – the log level. Valid values: DEBUG, INFO, WARNING, ERROR, CRITICAL

  • +
  • msg – the message

  • +
+
+
+
+ +
+
+pyop2.logger.progress(level, msg, *args, **kwargs)
+

A context manager to print a progress message.

+

The block is wrapped in msg..., msg...done log messages +with an appropriate indent (to distinguish nested messages).

+
+
Parameters:
+
    +
  • level – the log level. See log() for valid values

  • +
  • msg – the message.

  • +
+
+
+

See log() for more details.

+
+ +
+
+

pyop2.mpi module

+

PyOP2 MPI communicator.

+
+
+pyop2.mpi.internal_comm(comm, obj)
+

Creates an internal comm from the user comm. +If comm is None, create an internal communicator from COMM_WORLD +:arg comm: A communicator or None +:arg obj: The object which the comm is an attribute of +(usually self)

+
+
Returns pyop2_comm:
+

A PyOP2 internal communicator

+
+
+
+ +
+
+pyop2.mpi.is_pyop2_comm(comm)
+

Returns True if comm is a PyOP2 communicator, +False if comm is another communicator. +Raises an exception if comm is not a communicator.

+
+
Parameters:
+

comm – Communicator to query

+
+
+
+ +
+
+pyop2.mpi.incref(comm)
+

Increment communicator reference count

+
+ +
+
+pyop2.mpi.decref(comm)
+

Decrement communicator reference count

+
+ +
+
+class pyop2.mpi.temp_internal_comm(comm)
+

Bases: object

+

Use a PyOP2 internal communicator and +increment and decrement the internal comm. +:arg comm: Any communicator

+
+ +
+
+

pyop2.op2 module

+
+
+

pyop2.parloop module

+
+
+

pyop2.profiling module

+
+
+pyop2.profiling.timed_stage()
+

Enter a code Stage, this is a PETSc log Stage.

+
+
Parameters:
+

name – The name of the stage.

+
+
+
+ +
+
+pyop2.profiling.timed_region()
+

Time a code region, this is a PETSc log Event.

+
+
Parameters:
+

name – The name of the region.

+
+
+
+ +
+
+class pyop2.profiling.timed_function(name=None)
+

Bases: object

+
+ +
+
+

pyop2.sparsity module

+
+
+

pyop2.utils module

+

Common utility classes/functions.

+
+
+class pyop2.utils.cached_property(fget, doc=None)
+

Bases: object

+

A read-only @property that is only evaluated once. The value is cached +on the object itself rather than the function or class; this should prevent +memory leakage.

+
+ +
+
+pyop2.utils.as_tuple(item, type=None, length=None, allow_none=False)
+
+ +
+
+pyop2.utils.as_type(obj, typ)
+

Return obj if it is of dtype typ, otherwise return a copy type-cast to +typ.

+
+ +
+
+pyop2.utils.tuplify(xs)
+

Turn a data structure into a tuple tree.

+
+ +
+
+class pyop2.utils.validate_base(*checks)
+

Bases: object

+

Decorator to validate arguments

+

Formal parameters that don’t exist in the definition of the function +being decorated as well as actual arguments not being present when +the validation is called are silently ignored.

+
+
+check_args(args, kwargs)
+
+ +
+ +
+
+class pyop2.utils.validate_type(*checks)
+

Bases: validate_base

+

Decorator to validate argument types

+

The decorator expects one or more arguments, which are 3-tuples of +(name, type, exception), where name is the argument name in the +function being decorated, type is the argument type to be validated +and exception is the exception type to be raised if validation fails.

+
+
+check_arg(arg, argtype, exception)
+
+ +
+ +
+
+class pyop2.utils.validate_in(*checks)
+

Bases: validate_base

+

Decorator to validate argument is in a set of valid argument values

+

The decorator expects one or more arguments, which are 3-tuples of +(name, list, exception), where name is the argument name in the +function being decorated, list is the list of valid argument values +and exception is the exception type to be raised if validation fails.

+
+
+check_arg(arg, values, exception)
+
+ +
+ +
+
+class pyop2.utils.validate_range(*checks)
+

Bases: validate_base

+

Decorator to validate argument value is in a given numeric range

+

The decorator expects one or more arguments, which are 3-tuples of +(name, range, exception), where name is the argument name in the +function being decorated, range is a 2-tuple defining the valid argument +range and exception is the exception type to be raised if validation +fails.

+
+
+check_arg(arg, range, exception)
+
+ +
+ +
+
+class pyop2.utils.validate_dtype(*checks)
+

Bases: validate_base

+

Decorator to validate argument value is a valid Numpy dtype

+

The decorator expects one or more arguments, which are 3-tuples of +(name, _, exception), where name is the argument name in the +function being decorated, second argument is ignored and exception +is the exception type to be raised if validation fails.

+
+
+check_arg(arg, ignored, exception)
+
+ +
+ +
+
+pyop2.utils.verify_reshape(data, dtype, shape, allow_none=False)
+

Verify data is of type dtype and try to reshape it to shape.

+
+ +
+
+pyop2.utils.align(bytes, alignment=16)
+

Align BYTES to a multiple of ALIGNMENT

+
+ +
+
+pyop2.utils.flatten(iterable)
+

Flatten a given nested iterable.

+
+ +
+
+pyop2.utils.parser(description=None, group=False)
+

Create default argparse.ArgumentParser parser for pyop2 programs.

+
+ +
+
+pyop2.utils.parse_args(*args, **kwargs)
+

Return parsed arguments as variables for later use.

+

ARGS and KWARGS are passed into the parser instantiation. +The only recognised options are group and description.

+
+ +
+
+pyop2.utils.trim(docstring)
+

Trim a docstring according to PEP 257.

+
+ +
+
+pyop2.utils.strip(code)
+
+ +
+
+pyop2.utils.get_petsc_dir()
+
+ +
+
+

pyop2.version module

+
+
+

Module contents

+
+
+ + +
+
+
+
+ +
+
+ + + + \ No newline at end of file diff --git a/pyop2.types.html b/pyop2.types.html new file mode 100644 index 000000000..7f5c3659e --- /dev/null +++ b/pyop2.types.html @@ -0,0 +1,2472 @@ + + + + + + + + pyop2.types package — PyOP2 2020.0 documentation + + + + + + + + + + + + + +
+
+
+
+ +
+

pyop2.types package

+
+

Submodules

+
+
+

pyop2.types.access module

+
+
+class pyop2.types.access.Access(value)
+

Bases: IntEnum

+

An enumeration.

+
+
+READ = 1
+
+ +
+
+WRITE = 2
+
+ +
+
+RW = 3
+
+ +
+
+INC = 4
+
+ +
+
+MIN = 5
+
+ +
+
+MAX = 6
+
+ +
+ +
+
+pyop2.types.access.READ = Access.READ
+

The Global, Dat, or Mat is accessed read-only.

+
+ +
+
+pyop2.types.access.WRITE = Access.WRITE
+

The Global, Dat, or Mat is accessed write-only, +and OP2 is not required to handle write conflicts.

+
+ +
+
+pyop2.types.access.RW = Access.RW
+

The Global, Dat, or Mat is accessed for reading +and writing, and OP2 is not required to handle write conflicts.

+
+ +
+
+pyop2.types.access.INC = Access.INC
+

The kernel computes increments to be summed onto a Global, +Dat, or Mat. OP2 is responsible for managing the write +conflicts caused.

+
+ +
+
+pyop2.types.access.MIN = Access.MIN
+

The kernel contributes to a reduction into a Global using a min +operation. OP2 is responsible for reducing over the different kernel +invocations.

+
+ +
+
+pyop2.types.access.MAX = Access.MAX
+

The kernel contributes to a reduction into a Global using a max +operation. OP2 is responsible for reducing over the different kernel +invocations.

+
+ +
+
+

pyop2.types.dat module

+
+
+class pyop2.types.dat.AbstractDat(dataset, data=None, dtype=None, name=None)
+

Bases: DataCarrier, EmptyDataMixin, ABC

+

OP2 vector data. A Dat holds values on every element of a +DataSet.

+

If a Set is passed as the dataset argument, rather +than a DataSet, the Dat is created with a default +DataSet dimension of 1.

+

If a Dat is passed as the dataset argument, a copy is +returned.

+

It is permissible to pass None as the data argument. In this +case, allocation of the data buffer is postponed until it is +accessed.

+
+

Note

+

If the data buffer is not passed in, it is implicitly +initialised to be zero.

+
+

When a Dat is passed to pyop2.op2.par_loop(), the map via +which indirection occurs and the access descriptor are passed by +calling the Dat. For instance, if a Dat named D is +to be accessed for reading via a Map named M, this is +accomplished by

+
D(pyop2.READ, M)
+
+
+

The Map through which indirection occurs can be indexed +using the index notation described in the documentation for the +Map. Direct access to a Dat is accomplished by +omitting the path argument.

+

Dat objects support the pointwise linear algebra operations ++=, *=, -=, /=, where *= and /= also support +multiplication / division by a scalar.

+
+
+split
+

Tuple containing only this Dat.

+
+ +
+
+dataset
+

DataSet on which the Dat is defined.

+
+ +
+
+dim
+

The shape of the values for each element of the object.

+
+ +
+
+cdim
+

The scalar number of values for each member of the object. This is +the product of the dim tuple.

+
+ +
+
+property data
+

Numpy array containing the data values.

+

With this accessor you are claiming that you will modify +the values you get back. If you only need to look at the +values, use data_ro() instead.

+

This only shows local values, to see the halo values too use +data_with_halos().

+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+property data_with_halos
+

A view of this Dat’s data.

+

This accessor marks the Dat as dirty, see +data() for more details on the semantics.

+

With this accessor, you get to see up to date halo values, but +you should not try and modify them, because they will be +overwritten by the next halo exchange.

+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+property data_ro
+

Numpy array containing the data values. Read-only.

+

With this accessor you are not allowed to modify the values +you get back. If you need to do so, use data() instead.

+

This only shows local values, to see the halo values too use +data_ro_with_halos().

+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+property data_ro_with_halos
+

A view of this Dat’s data.

+

This accessor does not mark the Dat as dirty, and is +a read only view, see data_ro() for more details on the +semantics.

+

With this accessor, you get to see up to date halo values, but +you should not try and modify them, because they will be +overwritten by the next halo exchange.

+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+property data_wo
+

Numpy array containing the data values that is only valid for writing to.

+

This only shows local values, to see the halo values too use +data_wo_with_halos().

+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+property data_wo_with_halos
+

Return a write-only view of all the data values.

+

This method, unlike data_with_halos(), avoids a halo exchange +if the halo is dirty.

+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+save(filename)
+

Write the data array to file filename in NumPy format.

+
+ +
+
+load(filename)
+

Read the data stored in file filename into a NumPy array +and store the values in _data().

+
+ +
+
+shape
+
+ +
+
+dtype
+
+ +
+
+nbytes
+

Return an estimate of the size of the data associated with this +Dat in bytes. This will be the correct size of the data +payload, but does not take into account the (presumably small) +overhead of the object and its metadata.

+

Note that this is the process local memory usage, not the sum +over all MPI processes.

+
+ +
+
+zero(subset=None)
+

Zero the data associated with this Dat

+
+
Parameters:
+

subset – A Subset of entries to zero (optional).

+
+
+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+copy(other, subset=None)
+

Copy the data in this Dat into another.

+
+
Parameters:
+
    +
  • other – The destination Dat

  • +
  • subset – A Subset of elements to copy (optional)

  • +
+
+
+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+inner(other)
+

Compute the l2 inner product of the flattened Dat

+
+
Parameters:
+

other – the other Dat to compute the inner +product against. The complex conjugate of this is taken.

+
+
+
+ +
+
+property norm
+

Compute the l2 norm of this Dat

+
+

Note

+

This acts on the flattened data (see also inner()).

+
+
+ +
+
+global_to_local_begin(access_mode)
+

Begin a halo exchange from global to ghosted representation.

+
+
Parameters:
+

access_mode – Mode with which the data will subsequently +be accessed.

+
+
+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+global_to_local_end(access_mode)
+

End a halo exchange from global to ghosted representation.

+
+
Parameters:
+

access_mode – Mode with which the data will subsequently +be accessed.

+
+
+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+local_to_global_begin(insert_mode)
+

Begin a halo exchange from ghosted to global representation.

+
+
Parameters:
+

insert_mode – insertion mode (an access descriptor)

+
+
+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+local_to_global_end(insert_mode)
+

End a halo exchange from ghosted to global representation.

+
+
Parameters:
+

insert_mode – insertion mode (an access descriptor)

+
+
+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+frozen_halo(access_mode)
+

Temporarily disable halo exchanges inside a context manager.

+
+
Parameters:
+

access_mode – Mode with which the data will subsequently be accessed.

+
+
+

This is useful in cases where one is repeatedly writing to a Dat with +the same access descriptor since the intermediate updates can be skipped.

+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+freeze_halo(access_mode)
+

Disable halo exchanges.

+
+
Parameters:
+

access_mode – Mode with which the data will subsequently be accessed.

+
+
+

Note that some bookkeeping is needed when freezing halos. Prefer to use the +Dat.frozen_halo() context manager.

+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+unfreeze_halo()
+

Re-enable halo exchanges.

+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+ +
+
+class pyop2.types.dat.DatView(dat, index)
+

Bases: AbstractDat

+

An indexed view into a Dat.

+

This object can be used like a Dat but the kernel will +only see the requested index, rather than the full data.

+
+
Parameters:
+
    +
  • dat – The Dat to create a view into.

  • +
  • index – The component to select a view of.

  • +
+
+
+
+
+cdim
+
+ +
+
+dim
+
+ +
+
+shape
+
+ +
+
+property halo_valid
+
+ +
+
+property data
+

Numpy array containing the data values.

+

With this accessor you are claiming that you will modify +the values you get back. If you only need to look at the +values, use data_ro() instead.

+

This only shows local values, to see the halo values too use +data_with_halos().

+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+property data_ro
+

Numpy array containing the data values. Read-only.

+

With this accessor you are not allowed to modify the values +you get back. If you need to do so, use data() instead.

+

This only shows local values, to see the halo values too use +data_ro_with_halos().

+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+property data_wo
+

Numpy array containing the data values that is only valid for writing to.

+

This only shows local values, to see the halo values too use +data_wo_with_halos().

+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+property data_with_halos
+

A view of this Dat’s data.

+

This accessor marks the Dat as dirty, see +data() for more details on the semantics.

+

With this accessor, you get to see up to date halo values, but +you should not try and modify them, because they will be +overwritten by the next halo exchange.

+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+property data_ro_with_halos
+

A view of this Dat’s data.

+

This accessor does not mark the Dat as dirty, and is +a read only view, see data_ro() for more details on the +semantics.

+

With this accessor, you get to see up to date halo values, but +you should not try and modify them, because they will be +overwritten by the next halo exchange.

+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+property data_wo_with_halos
+

Return a write-only view of all the data values.

+

This method, unlike data_with_halos(), avoids a halo exchange +if the halo is dirty.

+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+ +
+
+class pyop2.types.dat.Dat(*args, **kwargs)
+

Bases: AbstractDat, VecAccessMixin

+
+
+vec_context(access)
+

A context manager for a PETSc.Vec from a Dat.

+
+
Parameters:
+

access – Access descriptor: READ, WRITE, or RW.

+
+
+
+ +
+ +
+
+class pyop2.types.dat.MixedDat(mdset_or_dats)
+

Bases: AbstractDat, VecAccessMixin

+

A container for a bag of Dats.

+

Initialized either from a MixedDataSet, a MixedSet, or +an iterable of DataSets and/or Sets, where all the +Sets are implicitly upcast to DataSets

+
mdat = op2.MixedDat(mdset)
+mdat = op2.MixedDat([dset1, ..., dsetN])
+
+
+

or from an iterable of Dats

+
mdat = op2.MixedDat([dat1, ..., datN])
+
+
+
+
+property dat_version
+
+ +
+
+increment_dat_version()
+
+ +
+
+dtype
+

The NumPy dtype of the data.

+
+ +
+
+split
+

The underlying tuple of Dats.

+
+ +
+
+dataset
+

MixedDataSets this MixedDat is defined on.

+
+ +
+
+property data
+

Numpy arrays containing the data excluding halos.

+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+property data_with_halos
+

Numpy arrays containing the data including halos.

+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+property data_ro
+

Numpy arrays with read-only data excluding halos.

+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+property data_ro_with_halos
+

Numpy arrays with read-only data including halos.

+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+property data_wo
+

Numpy arrays with write-only data excluding halos.

+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+property data_wo_with_halos
+

Numpy arrays with write-only data including halos.

+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+property halo_valid
+

Does this Dat have up to date halos?

+
+ +
+
+global_to_local_begin(access_mode)
+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+global_to_local_end(access_mode)
+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+local_to_global_begin(insert_mode)
+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+local_to_global_end(insert_mode)
+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+freeze_halo(access_mode)
+

Disable halo exchanges.

+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+unfreeze_halo()
+

Re-enable halo exchanges.

+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+zero(subset=None)
+

Zero the data associated with this MixedDat.

+
+
Parameters:
+

subset – optional subset of entries to zero (not implemented).

+
+
+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+nbytes
+

Return an estimate of the size of the data associated with this +MixedDat in bytes. This will be the correct size of the data +payload, but does not take into account the (presumably small) +overhead of the object and its metadata.

+

Note that this is the process local memory usage, not the sum +over all MPI processes.

+
+ +
+
+copy(other, subset=None)
+

Copy the data in this MixedDat into another.

+
+
Parameters:
+
    +
  • other – The destination MixedDat

  • +
  • subset – Subsets are not supported, this must be None

  • +
+
+
+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+inner(other)
+

Compute the l2 inner product.

+
+
Parameters:
+

other – the other MixedDat to compute the inner product against

+
+
+
+ +
+
+vec_context(access)
+

A context manager scattering the arrays of all components of this +MixedDat into a contiguous PETSc.Vec and reverse +scattering to the original arrays when exiting the context.

+
+
Parameters:
+

access – Access descriptor: READ, WRITE, or RW.

+
+
+
+

Note

+

The Vec obtained from this context is in +the correct order to be left multiplied by a compatible +MixedMat. In parallel it is not just a +concatenation of the underlying Dats.

+
+
+ +
+ +
+
+class pyop2.types.dat.frozen_halo(dat, access_mode)
+

Bases: object

+

Context manager handling the freezing and unfreezing of halos.

+
+
Parameters:
+
    +
  • dat – The Dat whose halo is to be frozen.

  • +
  • access_mode – Mode with which the Dat will be accessed whilst +its halo is frozen.

  • +
+
+
+
+ +
+
+

pyop2.types.data_carrier module

+
+
+class pyop2.types.data_carrier.DataCarrier
+

Bases: ABC

+

Abstract base class for OP2 data.

+

Actual objects will be DataCarrier objects of rank 0 +(Global), rank 1 (Dat), or rank 2 +(Mat)

+
+
+dtype
+

The Python type of the data.

+
+ +
+
+ctype
+

The C type of the data.

+
+ +
+
+name
+

User-defined label.

+
+ +
+
+dim
+

The shape tuple of the values for each element of the object.

+
+ +
+
+cdim
+

The scalar number of values for each member of the object. This is +the product of the dim tuple.

+
+ +
+
+increment_dat_version()
+
+ +
+ +
+
+class pyop2.types.data_carrier.EmptyDataMixin(data, dtype, shape)
+

Bases: ABC

+

A mixin for Dat and Global objects that takes +care of allocating data on demand if the user has passed nothing +in.

+

Accessing the _data property allocates a zeroed data array +if it does not already exist.

+
+ +
+
+class pyop2.types.data_carrier.VecAccessMixin(petsc_counter=None)
+

Bases: ABC

+
+
+property dat_version
+
+ +
+
+abstract vec_context(access)
+
+ +
+
+property vec
+

Context manager for a PETSc Vec appropriate for this Dat.

+

You’re allowed to modify the data you get back from this view.

+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+property vec_wo
+

Context manager for a PETSc Vec appropriate for this Dat.

+

You’re allowed to modify the data you get back from this view, +but you cannot read from it.

+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+property vec_ro
+

Context manager for a PETSc Vec appropriate for this Dat.

+

You’re not allowed to modify the data you get back from this view.

+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+ +
+
+

pyop2.types.dataset module

+
+
+class pyop2.types.dataset.DataSet(*args, **kwargs)
+

Bases: ObjectCached

+

PyOP2 Data Set

+

Set used in the op2.Dat structures to specify the dimension of the data.

+
+
+dim
+

The shape tuple of the values for each element of the set.

+
+ +
+
+cdim
+

The scalar number of values for each member of the set. This is +the product of the dim tuple.

+
+ +
+
+name
+

Returns the name of the data set.

+
+ +
+
+set
+

Returns the parent set of the data set.

+
+ +
+
+lgmap
+

A PETSc LGMap mapping process-local indices to global +indices for this DataSet.

+
+ +
+
+scalar_lgmap
+
+ +
+
+unblocked_lgmap
+

A PETSc LGMap mapping process-local indices to global +indices for this DataSet with a block size of 1.

+
+ +
+
+field_ises
+

A list of PETSc ISes defining the global indices for each set in +the DataSet.

+

Used when extracting blocks from matrices for solvers.

+
+ +
+
+local_ises
+

A list of PETSc ISes defining the local indices for each set in the DataSet.

+

Used when extracting blocks from matrices for assembly.

+
+ +
+
+layout_vec
+

A PETSc Vec compatible with the dof layout of this DataSet.

+
+ +
+
+dm
+
+ +
+ +
+
+class pyop2.types.dataset.GlobalDataSet(*args, **kwargs)
+

Bases: DataSet

+

A proxy DataSet for use in a Sparsity where the +matrix has Global rows or columns.

+
+
Parameters:
+

global – The Global on which this object is based.

+
+
+
+
+dim
+

The shape tuple of the values for each element of the set.

+
+ +
+
+cdim
+

The scalar number of values for each member of the set. This is +the product of the dim tuple.

+
+ +
+
+name
+

Returns the name of the data set.

+
+ +
+
+set
+

Returns the parent set of the data set.

+
+ +
+
+size
+

The number of local entries in the Dataset (1 on rank 0)

+
+ +
+
+lgmap
+

A PETSc LGMap mapping process-local indices to global +indices for this DataSet.

+
+ +
+
+unblocked_lgmap
+

A PETSc LGMap mapping process-local indices to global +indices for this DataSet with a block size of 1.

+
+ +
+
+local_ises
+

A list of PETSc ISes defining the local indices for each set in the DataSet.

+

Used when extracting blocks from matrices for assembly.

+
+ +
+
+layout_vec
+

A PETSc Vec compatible with the dof layout of this DataSet.

+
+ +
+
+dm
+
+ +
+ +
+
+class pyop2.types.dataset.MixedDataSet(*args, **kwargs)
+

Bases: DataSet

+

A container for a bag of DataSets.

+

Initialized either from a MixedSet and an iterable or iterator of +dims of corresponding length

+
mdset = op2.MixedDataSet(mset, [dim1, ..., dimN])
+
+
+

or from a tuple of Sets and an iterable of dims of +corresponding length

+
mdset = op2.MixedDataSet([set1, ..., setN], [dim1, ..., dimN])
+
+
+

If all dims are to be the same, they can also be given as an +int for either of the above invocations

+
mdset = op2.MixedDataSet(mset, dim)
+mdset = op2.MixedDataSet([set1, ..., setN], dim)
+
+
+

Initialized from a MixedSet without explicitly specifying dims, in which case +they default to 1

+
mdset = op2.MixedDataSet(mset)
+
+
+

Initialized from an iterable or iterator of DataSets and/or +Sets, where Sets are implicitly upcast to +DataSets of dim 1

+
mdset = op2.MixedDataSet([dset1, ..., dsetN])
+
+
+
+
Parameters:
+
    +
  • arg – a MixedSet or an iterable or a generator +expression of Sets or DataSets or a +mixture of both

  • +
  • dims – None (the default) or an int or an iterable or +generator expression of ints, which must be +of the same length as arg

  • +
+
+
+
+

Warning

+

When using generator expressions for arg or dims, these +must terminate or else will cause an infinite loop.

+
+
+
+split
+

The underlying tuple of DataSets.

+
+ +
+
+dim
+

The shape tuple of the values for each element of the sets.

+
+ +
+
+cdim
+

The sum of the scalar number of values for each member of the sets. +This is the sum of products of the dim tuples.

+
+ +
+
+name
+

Returns the name of the data sets.

+
+ +
+
+set
+

Returns the MixedSet this MixedDataSet is +defined on.

+
+ +
+
+layout_vec
+

A PETSc Vec compatible with the dof layout of this MixedDataSet.

+
+ +
+
+lgmap
+

A PETSc LGMap mapping process-local indices to global +indices for this MixedDataSet.

+
+ +
+
+unblocked_lgmap
+

A PETSc LGMap mapping process-local indices to global +indices for this DataSet with a block size of 1.

+
+ +
+ +
+
+

pyop2.types.glob module

+
+
+class pyop2.types.glob.SetFreeDataCarrier(dim, data=None, dtype=None, name=None)
+

Bases: DataCarrier, EmptyDataMixin

+
+
+property shape
+
+ +
+
+property dtype
+

The Python type of the data.

+
+ +
+
+property data_ro
+

Data array.

+
+ +
+
+property data_wo
+
+ +
+
+property data
+

Data array.

+
+ +
+
+property data_with_halos
+
+ +
+
+property data_ro_with_halos
+
+ +
+
+property data_wo_with_halos
+
+ +
+
+property halo_valid
+
+ +
+
+copy(other, subset=None)
+

Copy the data in this SetFreeDataCarrier into another.

+
+
Parameters:
+
    +
  • other – The destination Global

  • +
  • subset – A Subset of elements to copy (optional)

  • +
+
+
+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+property split
+
+ +
+
+property nbytes
+

Return an estimate of the size of the data associated with this +Global in bytes. This will be the correct size of the +data payload, but does not take into account the overhead of +the object and its metadata. This renders this method of +little statistical significance, however it is included to +make the interface consistent.

+
+ +
+
+inner(other)
+
+ +
+ +
+
+class pyop2.types.glob.Global(dim, data=None, dtype=None, name=None, comm=None)
+

Bases: SetFreeDataCarrier, VecAccessMixin

+

OP2 global value.

+

When a Global is passed to a pyop2.op2.par_loop(), the access +descriptor is passed by calling the Global. For example, if +a Global named G is to be accessed for reading, this is +accomplished by:

+
G(pyop2.READ)
+
+
+

It is permissible to pass None as the data argument. In this +case, allocation of the data buffer is postponed until it is +accessed.

+
+

Note

+

If the data buffer is not passed in, it is implicitly +initialised to be zero.

+
+
+
+dataset
+
+ +
+
+duplicate()
+

Return a deep copy of self.

+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+zero(subset=None)
+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+global_to_local_begin(access_mode)
+

Dummy halo operation for the case in which a Global forms +part of a MixedDat.

+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+global_to_local_end(access_mode)
+

Dummy halo operation for the case in which a Global forms +part of a MixedDat.

+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+local_to_global_begin(insert_mode)
+

Dummy halo operation for the case in which a Global forms +part of a MixedDat.

+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+local_to_global_end(insert_mode)
+

Dummy halo operation for the case in which a Global forms +part of a MixedDat.

+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+frozen_halo(access_mode)
+

Dummy halo operation for the case in which a Global forms +part of a MixedDat.

+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+freeze_halo(access_mode)
+

Dummy halo operation for the case in which a Global forms +part of a MixedDat.

+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+unfreeze_halo()
+

Dummy halo operation for the case in which a Global forms +part of a MixedDat.

+

This function is logically collective over MPI ranks, it is an +error to call it on fewer than all the ranks in MPI communicator.

+
+ +
+
+vec_context(access)
+

A context manager for a PETSc.Vec from a Global.

+
+
Parameters:
+

access – Access descriptor: READ, WRITE, or RW.

+
+
+
+ +
+ +
+
+class pyop2.types.glob.Constant(dim, data=None, dtype=None, name=None, comm=None)
+

Bases: SetFreeDataCarrier

+

OP2 constant value.

+

When a Constant is passed to a pyop2.op2.par_loop(), the access +descriptor is always Access.READ. Used in cases where collective +functionality is not required, or is not desirable. +For example: objects that have no associated mesh and do not have a +communicator.

+
+
+duplicate()
+

Return a deep copy of self.

+
+ +
+ +
+
+

pyop2.types.halo module

+
+
+class pyop2.types.halo.Halo
+

Bases: ABC

+

A description of a halo associated with a pyop2.types.set.Set.

+

The halo object describes which pyop2.types.set.Set elements are sent +where, and which pyop2.types.set.Set elements are received from where.

+
+
+abstract property comm
+

The MPI communicator for this halo.

+
+ +
+
+abstract property local_to_global_numbering
+

The mapping from process-local to process-global numbers for this halo.

+
+ +
+
+abstract global_to_local_begin(dat, insert_mode)
+

Begin an exchange from global (assembled) to local (ghosted) representation.

+
+
Parameters:
+
+
+
+
+ +
+
+abstract global_to_local_end(dat, insert_mode)
+

Finish an exchange from global (assembled) to local (ghosted) representation.

+
+
Parameters:
+
+
+
+
+ +
+
+abstract local_to_global_begin(dat, insert_mode)
+

Begin an exchange from local (ghosted) to global (assembled) representation.

+
+
Parameters:
+
+
+
+
+ +
+
+abstract local_to_global_end(dat, insert_mode)
+

Finish an exchange from local (ghosted) to global (assembled) representation.

+
+
Parameters:
+
+
+
+
+ +
+ +
+
+

pyop2.types.map module

+
+
+class pyop2.types.map.Map(iterset, toset, arity, values=None, name=None, offset=None, offset_quotient=None)
+

Bases: object

+

OP2 map, a relation between two Set objects.

+

Each entry in the iterset maps to arity entries in the +toset. When a map is used in a pyop2.op2.par_loop(), it is +possible to use Python index notation to select an individual entry on the +right hand side of this map. There are two possibilities:

+
    +
  • No index. All arity Dat entries will be passed to the +kernel.

  • +
  • An integer: some_map[n]. The n th entry of the +map result will be passed to the kernel.

  • +
+
+
+dtype = dtype('int32')
+
+ +
+
+split
+
+ +
+
+iterset
+

Set mapped from.

+
+ +
+
+toset
+

Set mapped to.

+
+ +
+
+arity
+

Arity of the mapping: number of toset elements mapped to per +iterset element.

+
+ +
+
+arities
+

Arity of the mapping: number of toset elements mapped to per +iterset element.

+
+
Return type:
+

tuple

+
+
+
+ +
+
+arange
+

Tuple of arity offsets for each constituent Map.

+
+ +
+
+values
+

Mapping array.

+

This only returns the map values for local points, to see the +halo points too, use values_with_halo().

+
+ +
+
+values_with_halo
+

Mapping array.

+

This returns all map values (including halo points), see +values() if you only need to look at the local +points.

+
+ +
+
+name
+

User-defined label

+
+ +
+
+offset
+

The vertical offset.

+
+ +
+
+offset_quotient
+

The offset quotient.

+
+ +
+
+flattened_maps
+

Return all component maps.

+

This is useful to flatten nested ComposedMaps.

+
+ +
+ +
+
+class pyop2.types.map.PermutedMap(map_, permutation)
+

Bases: Map

+

Composition of a standard Map with a constant permutation.

+
+
Parameters:
+
    +
  • map – The map to permute.

  • +
  • permutation – The permutation of the map indices.

  • +
+
+
+

Where normally staging to element data is performed as

+
local[i] = global[map[i]]
+
+
+

With a PermutedMap we instead get

+
local[i] = global[map[permutation[i]]]
+
+
+

This might be useful if your local kernel wants data in a +different order to the one that the map provides, and you don’t +want two global-sized data structures.

+
+ +
+
+class pyop2.types.map.ComposedMap(*maps_, name=None)
+

Bases: Map

+

Composition of Maps, PermutedMaps, and/or ComposedMaps.

+
+
Parameters:
+

maps – The maps to compose.

+
+
+

Where normally staging to element data is performed as

+
local[i] = global[map[i]]
+
+
+

With a ComposedMap we instead get

+
local[i] = global[maps_[0][maps_[1][maps_[2][...[i]]]]]
+
+
+

This might be useful if the map you want can be represented by +a composition of existing maps.

+
+
+values
+
+ +
+
+values_with_halo
+
+ +
+
+flattened_maps
+
+ +
+ +
+
+class pyop2.types.map.MixedMap(*args, **kwargs)
+

Bases: Map, ObjectCached

+

A container for a bag of Maps.

+
+
Parameters:
+

maps (iterable) – Iterable of Maps

+
+
+
+
+split
+

The underlying tuple of Maps.

+
+ +
+
+iterset
+

MixedSet mapped from.

+
+ +
+
+toset
+

MixedSet mapped to.

+
+ +
+
+arity
+

Arity of the mapping: total number of toset elements mapped to per +iterset element.

+
+ +
+
+arities
+

Arity of the mapping: number of toset elements mapped to per +iterset element.

+
+
Return type:
+

tuple

+
+
+
+ +
+
+arange
+

Tuple of arity offsets for each constituent Map.

+
+ +
+
+values
+

Mapping arrays excluding data for halos.

+

This only returns the map values for local points, to see the +halo points too, use values_with_halo().

+
+ +
+
+values_with_halo
+

Mapping arrays including data for halos.

+

This returns all map values (including halo points), see +values() if you only need to look at the local +points.

+
+ +
+
+name
+

User-defined labels

+
+ +
+
+offset
+

Vertical offsets.

+
+ +
+
+offset_quotient
+

Offsets quotient.

+
+ +
+
+flattened_maps
+
+ +
+ +
+
+

pyop2.types.mat module

+
+
+

pyop2.types.set module

+
+
+class pyop2.types.set.Set(size, name=None, halo=None, comm=None, constrained_size=0)
+

Bases: object

+

OP2 set.

+
+
Parameters:
+
    +
  • size (integer or list of four integers.) – The size of the set.

  • +
  • name (string) – The name of the set (optional).

  • +
  • halo – An existing halo to use (optional).

  • +
+
+
+

When the set is employed as an iteration space in a +pyop2.op2.par_loop(), the extent of any local iteration space within +each set entry is indicated in brackets. See the example in +pyop2.op2.par_loop() for more details.

+

The size of the set can either be an integer, or a list of four +integers. The latter case is used for running in parallel where +we distinguish between:

+
+
    +
  • CORE (owned and not touching halo)

  • +
  • OWNED (owned, touching halo)

  • +
  • EXECUTE HALO (not owned, but executed over redundantly)

  • +
  • NON EXECUTE HALO (not owned, read when executing in the execute halo)

  • +
+
+

If a single integer is passed, we assume that we’re running in +serial and there is no distinction.

+

The division of set elements is:

+
[0, CORE)
+[CORE, OWNED)
+[OWNED, GHOST)
+
+
+

Halo send/receive data is stored on sets in a Halo.

+
+
+property indices
+

Returns iterator.

+
+ +
+
+core_size
+

Core set size. Owned elements not touching halo elements.

+
+ +
+
+constrained_size
+
+ +
+
+size
+

Set size, owned elements.

+
+ +
+
+total_size
+

Set size including ghost elements.

+
+ +
+
+sizes
+

Set sizes: core, owned, execute halo, total.

+
+ +
+
+core_part
+
+ +
+
+owned_part
+
+ +
+
+name
+

User-defined label

+
+ +
+
+halo
+

Halo associated with this Set

+
+ +
+
+property partition_size
+

Default partition size

+
+ +
+
+layers
+

Return None (not an ExtrudedSet).

+
+ +
+
+intersection(other)
+
+ +
+
+union(other)
+
+ +
+
+difference(other)
+
+ +
+
+symmetric_difference(other)
+
+ +
+ +
+
+class pyop2.types.set.GlobalSet(comm=None)
+

Bases: Set

+
+
+core_size
+
+ +
+
+size
+
+ +
+
+total_size
+

Total set size, including halo elements.

+
+ +
+
+sizes
+

Set sizes: core, owned, execute halo, total.

+
+ +
+
+name
+

User-defined label

+
+ +
+
+halo
+

Halo associated with this Set

+
+ +
+
+property partition_size
+

Default partition size

+
+ +
+ +
+
+class pyop2.types.set.ExtrudedSet(parent, layers, extruded_periodic=False)
+

Bases: Set

+

OP2 ExtrudedSet.

+
+
Parameters:
+
    +
  • parent (a Set.) – The parent Set to build this ExtrudedSet on top of

  • +
  • layers (an integer, indicating the number of layers for every entity, +or an array of shape (parent.total_size, 2) giving the start +and one past the stop layer for every entity. An entry +a, b = layers[e, ...] means that the layers for entity +e run over [a, b).) – The number of layers in this ExtrudedSet.

  • +
+
+
+

The number of layers indicates the number of times the base set is +extruded in the direction of the ExtrudedSet. As a +result, there are layers-1 extruded “cells” in an extruded set.

+
+
+parent
+
+ +
+
+layers
+

The layers of this extruded set.

+
+ +
+
+layers_array
+
+ +
+ +
+
+class pyop2.types.set.Subset(superset, indices)
+

Bases: ExtrudedSet

+

OP2 subset.

+
+
Parameters:
+
    +
  • superset (a Set or a Subset.) – The superset of the subset.

  • +
  • indices (a list of integers, or a numpy array.) – Elements of the superset that form the +subset. Duplicate values are removed when constructing the subset.

  • +
+
+
+
+
+superset
+

Returns the superset Set

+
+ +
+
+indices
+

Returns the indices pointing in the superset.

+
+ +
+
+owned_indices
+

Return the indices that correspond to the owned entities of the +superset.

+
+ +
+
+layers_array
+
+ +
+
+intersection(other)
+
+ +
+
+union(other)
+
+ +
+
+difference(other)
+
+ +
+
+symmetric_difference(other)
+
+ +
+ +
+
+class pyop2.types.set.SetPartition(set, offset, size)
+

Bases: object

+
+ +
+
+class pyop2.types.set.MixedSet(*args, **kwargs)
+

Bases: Set, ObjectCached

+

A container for a bag of Sets.

+
+
Parameters:
+

sets (iterable) – Iterable of Sets or ExtrudedSets

+
+
+
+
+split
+

The underlying tuple of Sets.

+
+ +
+
+core_size
+

Core set size. Owned elements not touching halo elements.

+
+ +
+
+constrained_size
+

Set size, owned constrained elements.

+
+ +
+
+size
+

Set size, owned elements.

+
+ +
+
+total_size
+

Total set size, including halo elements.

+
+ +
+
+sizes
+

Set sizes: core, owned, execute halo, total.

+
+ +
+
+name
+

User-defined labels.

+
+ +
+
+halo
+

Halos associated with these Sets.

+
+ +
+
+layers
+

Numbers of layers in the extruded mesh (or None if this MixedSet is not extruded).

+
+ +
+ +
+
+

Module contents

+
+
+ + +
+
+
+
+ +
+
+ + + + \ No newline at end of file diff --git a/search.html b/search.html new file mode 100644 index 000000000..b2b80aeab --- /dev/null +++ b/search.html @@ -0,0 +1,100 @@ + + + + + + + Search — PyOP2 2020.0 documentation + + + + + + + + + + + + + + + + + + + +
+
+
+
+ +

Search

+ + + + +

+ Searching for multiple words only shows matches that contain + all words. +

+ + +
+ + + +
+ + +
+ + +
+
+
+
+ +
+
+ + + + \ No newline at end of file diff --git a/searchindex.js b/searchindex.js new file mode 100644 index 000000000..e99c51c1f --- /dev/null +++ b/searchindex.js @@ -0,0 +1 @@ +Search.setIndex({"alltitles": {"Access descriptors": [[3, "access-descriptors"]], "Achieving Performance Portability with the IR": [[6, "achieving-performance-portability-with-the-ir"]], "Block Sparsity and Mat": [[9, "block-sparsity-and-mat"]], "Building a sparsity pattern": [[8, "building-a-sparsity-pattern"]], "CUDA backend": [[1, "cuda-backend"]], "Caching in PyOP2": [[2, "caching-in-pyop2"]], "Class caches": [[2, "class-caches"]], "Colouring": [[11, "colouring"]], "Computation-communication Overlap": [[10, "computation-communication-overlap"]], "Consolidating profiles from different runs": [[12, "consolidating-profiles-from-different-runs"]], "Const": [[3, "const"]], "Contents": [[5, "contents"]], "Creating a graph": [[12, "creating-a-graph"]], "Dat": [[3, "dat"]], "Data": [[3, "data"]], "Data layout": [[7, "data-layout"]], "Debugging cache leaks": [[2, "debugging-cache-leaks"]], "Device backends": [[1, "device-backends"]], "Distributed Assembly": [[10, "distributed-assembly"]], "GPU linear algebra": [[8, "gpu-linear-algebra"]], "GPU matrix assembly": [[8, "gpu-matrix-assembly"]], "Global": [[3, "global"]], "Halo exchange": [[10, "halo-exchange"]], "Host backends": [[1, "host-backends"]], "How to select specific kernel optimizations": [[6, "how-to-select-specific-kernel-optimizations"]], "Indices and tables": [[4, "indices-and-tables"]], "Installing PyOP2": [[5, "installing-pyop2"]], "Kernel API": [[7, "kernel-api"]], "Line-by-line profiling": [[12, "line-by-line-profiling"]], "Local Numbering": [[10, "local-numbering"]], "Local Renumbering and Staging": [[11, "local-renumbering-and-staging"]], "Local iteration spaces": [[7, "local-iteration-spaces"]], "Loop invocations": [[3, "loop-invocations"]], "Loops assembling matrices": [[3, "loops-assembling-matrices"]], "Loops with global 
reductions": [[3, "loops-with-global-reductions"]], "MPI": [[10, "mpi"]], "Mat": [[3, "mat"]], "Matrix assembly": [[8, "matrix-assembly"]], "Memory profiling": [[12, "memory-profiling"]], "Mixed Assembly": [[9, "mixed-assembly"]], "Mixed Set, DataSet, Map and Dat": [[9, "mixed-set-dataset-map-and-dat"]], "Mixed Types": [[9, "mixed-types"]], "Module contents": [[13, "module-contents"], [14, "module-contents"], [15, "module-contents"]], "Multiple Backend Support": [[0, "multiple-backend-support"]], "Object caches": [[2, "object-caches"]], "OpenCL backend": [[1, "opencl-backend"]], "OpenMP backend": [[1, "openmp-backend"]], "Optimizing kernels on CPUs": [[6, "optimizing-kernels-on-cpus"]], "Parallel Execution Plan": [[11, "parallel-execution-plan"]], "Parallel loops": [[3, "parallel-loops"]], "Partitioning": [[11, "partitioning"]], "Profiling": [[12, "profiling"]], "Profiling PyOP2 programs": [[12, "profiling-pyop2-programs"]], "PyOP2 Architecture": [[0, "pyop2-architecture"]], "PyOP2 Backends": [[1, "pyop2-backends"]], "PyOP2 Concepts": [[3, "pyop2-concepts"]], "PyOP2 Kernels": [[7, "pyop2-kernels"]], "PyOP2 Linear Algebra Interface": [[8, "pyop2-linear-algebra-interface"]], "Sequential backend": [[1, "sequential-backend"]], "Sets and mappings": [[3, "sets-and-mappings"]], "Solving a linear system": [[8, "solving-a-linear-system"]], "Sparse Matrix Storage Formats": [[8, "sparse-matrix-storage-formats"]], "Submodules": [[13, "submodules"], [14, "submodules"], [15, "submodules"]], "Subpackages": [[13, "subpackages"]], "The PyOP2 Intermediate Representation": [[6, "the-pyop2-intermediate-representation"]], "Using PyOP2\u2019s internal timers": [[12, "using-pyop2-s-internal-timers"]], "Using the Intermediate Representation": [[6, "using-the-intermediate-representation"]], "Welcome to PyOP2\u2019s documentation!": [[4, "welcome-to-pyop2-s-documentation"]], "pyop2 Package": [[16, "pyop2-package"]], "pyop2 package": [[13, "pyop2-package"]], "pyop2 user documentation": [[16, 
"pyop2-user-documentation"]], "pyop2.caching module": [[13, "pyop2-caching-module"]], "pyop2.codegen package": [[14, "pyop2-codegen-package"]], "pyop2.codegen.builder module": [[14, "pyop2-codegen-builder-module"]], "pyop2.codegen.loopycompat module": [[14, "pyop2-codegen-loopycompat-module"]], "pyop2.codegen.node module": [[14, "pyop2-codegen-node-module"]], "pyop2.codegen.optimise module": [[14, "pyop2-codegen-optimise-module"]], "pyop2.codegen.rep2loopy module": [[14, "pyop2-codegen-rep2loopy-module"]], "pyop2.codegen.representation module": [[14, "pyop2-codegen-representation-module"]], "pyop2.compilation module": [[13, "pyop2-compilation-module"]], "pyop2.configuration module": [[13, "module-pyop2.configuration"]], "pyop2.datatypes module": [[13, "module-pyop2.datatypes"]], "pyop2.exceptions module": [[13, "module-pyop2.exceptions"]], "pyop2.global_kernel module": [[13, "pyop2-global-kernel-module"]], "pyop2.local_kernel module": [[13, "pyop2-local-kernel-module"]], "pyop2.logger module": [[13, "module-pyop2.logger"]], "pyop2.mpi module": [[13, "module-pyop2.mpi"]], "pyop2.op2 module": [[13, "pyop2-op2-module"]], "pyop2.parloop module": [[13, "pyop2-parloop-module"]], "pyop2.profiling module": [[13, "module-pyop2.profiling"]], "pyop2.sparsity module": [[13, "pyop2-sparsity-module"]], "pyop2.types package": [[15, "pyop2-types-package"]], "pyop2.types.access module": [[15, "module-pyop2.types.access"]], "pyop2.types.dat module": [[15, "module-pyop2.types.dat"]], "pyop2.types.data_carrier module": [[15, "module-pyop2.types.data_carrier"]], "pyop2.types.dataset module": [[15, "module-pyop2.types.dataset"]], "pyop2.types.glob module": [[15, "module-pyop2.types.glob"]], "pyop2.types.halo module": [[15, "module-pyop2.types.halo"]], "pyop2.types.map module": [[15, "module-pyop2.types.map"]], "pyop2.types.mat module": [[15, "pyop2-types-mat-module"]], "pyop2.types.set module": [[15, "module-pyop2.types.set"]], "pyop2.utils module": [[13, "module-pyop2.utils"]], 
"pyop2.version module": [[13, "pyop2-version-module"]]}, "docnames": ["architecture", "backends", "caching", "concepts", "index", "installation", "ir", "kernels", "linear_algebra", "mixed", "mpi", "plan", "profiling", "pyop2", "pyop2.codegen", "pyop2.types", "user"], "envversion": {"sphinx": 61, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.todo": 2}, "filenames": ["architecture.rst", "backends.rst", "caching.rst", "concepts.rst", "index.rst", "installation.rst", "ir.rst", "kernels.rst", "linear_algebra.rst", "mixed.rst", "mpi.rst", "plan.rst", "profiling.rst", "pyop2.rst", "pyop2.codegen.rst", "pyop2.types.rst", "user.rst"], "indexentries": {"abstractdat (class in pyop2.types.dat)": [[15, "pyop2.types.dat.AbstractDat", false]], "access (class in pyop2.types.access)": [[15, "pyop2.types.access.Access", false]], "align() (in module pyop2.utils)": [[13, "pyop2.utils.align", false]], "arange (pyop2.types.map.map attribute)": [[15, "pyop2.types.map.Map.arange", false]], "arange (pyop2.types.map.mixedmap attribute)": [[15, "pyop2.types.map.MixedMap.arange", false]], "arities (pyop2.types.map.map attribute)": [[15, "pyop2.types.map.Map.arities", false]], "arities (pyop2.types.map.mixedmap attribute)": [[15, "pyop2.types.map.MixedMap.arities", false]], "arity (pyop2.types.map.map attribute)": [[15, "pyop2.types.map.Map.arity", false]], "arity (pyop2.types.map.mixedmap attribute)": [[15, "pyop2.types.map.MixedMap.arity", false]], "aritytypeerror": [[13, "pyop2.exceptions.ArityTypeError", false]], "as_cstr() (in module pyop2.datatypes)": [[13, "pyop2.datatypes.as_cstr", false]], "as_ctypes() (in module pyop2.datatypes)": [[13, "pyop2.datatypes.as_ctypes", false]], "as_numpy_dtype() (in module pyop2.datatypes)": [[13, 
"pyop2.datatypes.as_numpy_dtype", false]], "as_tuple() (in module pyop2.utils)": [[13, "pyop2.utils.as_tuple", false]], "as_type() (in module pyop2.utils)": [[13, "pyop2.utils.as_type", false]], "cache_dir (pyop2.configuration.configuration attribute)": [[13, "pyop2.configuration.Configuration.cache_dir", false]], "cached_property (class in pyop2.utils)": [[13, "pyop2.utils.cached_property", false]], "cdim (pyop2.types.dat.abstractdat attribute)": [[15, "pyop2.types.dat.AbstractDat.cdim", false]], "cdim (pyop2.types.dat.datview attribute)": [[15, "pyop2.types.dat.DatView.cdim", false]], "cdim (pyop2.types.data_carrier.datacarrier attribute)": [[15, "pyop2.types.data_carrier.DataCarrier.cdim", false]], "cdim (pyop2.types.dataset.dataset attribute)": [[15, "pyop2.types.dataset.DataSet.cdim", false]], "cdim (pyop2.types.dataset.globaldataset attribute)": [[15, "pyop2.types.dataset.GlobalDataSet.cdim", false]], "cdim (pyop2.types.dataset.mixeddataset attribute)": [[15, "pyop2.types.dataset.MixedDataSet.cdim", false]], "check_arg() (pyop2.utils.validate_dtype method)": [[13, "pyop2.utils.validate_dtype.check_arg", false]], "check_arg() (pyop2.utils.validate_in method)": [[13, "pyop2.utils.validate_in.check_arg", false]], "check_arg() (pyop2.utils.validate_range method)": [[13, "pyop2.utils.validate_range.check_arg", false]], "check_arg() (pyop2.utils.validate_type method)": [[13, "pyop2.utils.validate_type.check_arg", false]], "check_args() (pyop2.utils.validate_base method)": [[13, "pyop2.utils.validate_base.check_args", false]], "comm (pyop2.types.halo.halo property)": [[15, "pyop2.types.halo.Halo.comm", false]], "compilationerror": [[13, "pyop2.exceptions.CompilationError", false]], "composedmap (class in pyop2.types.map)": [[15, "pyop2.types.map.ComposedMap", false]], "configuration (class in pyop2.configuration)": [[13, "pyop2.configuration.Configuration", false]], "configurationerror": [[13, "pyop2.exceptions.ConfigurationError", false]], "constant (class in 
pyop2.types.glob)": [[15, "pyop2.types.glob.Constant", false]], "constrained_size (pyop2.types.set.mixedset attribute)": [[15, "pyop2.types.set.MixedSet.constrained_size", false]], "constrained_size (pyop2.types.set.set attribute)": [[15, "pyop2.types.set.Set.constrained_size", false]], "copy() (pyop2.types.dat.abstractdat method)": [[15, "pyop2.types.dat.AbstractDat.copy", false]], "copy() (pyop2.types.dat.mixeddat method)": [[15, "pyop2.types.dat.MixedDat.copy", false]], "copy() (pyop2.types.glob.setfreedatacarrier method)": [[15, "pyop2.types.glob.SetFreeDataCarrier.copy", false]], "core_part (pyop2.types.set.set attribute)": [[15, "pyop2.types.set.Set.core_part", false]], "core_size (pyop2.types.set.globalset attribute)": [[15, "pyop2.types.set.GlobalSet.core_size", false]], "core_size (pyop2.types.set.mixedset attribute)": [[15, "pyop2.types.set.MixedSet.core_size", false]], "core_size (pyop2.types.set.set attribute)": [[15, "pyop2.types.set.Set.core_size", false]], "ctype (pyop2.types.data_carrier.datacarrier attribute)": [[15, "pyop2.types.data_carrier.DataCarrier.ctype", false]], "dat (class in pyop2.types.dat)": [[15, "pyop2.types.dat.Dat", false]], "dat_version (pyop2.types.dat.mixeddat property)": [[15, "pyop2.types.dat.MixedDat.dat_version", false]], "dat_version (pyop2.types.data_carrier.vecaccessmixin property)": [[15, "pyop2.types.data_carrier.VecAccessMixin.dat_version", false]], "data (pyop2.types.dat.abstractdat property)": [[15, "pyop2.types.dat.AbstractDat.data", false]], "data (pyop2.types.dat.datview property)": [[15, "pyop2.types.dat.DatView.data", false]], "data (pyop2.types.dat.mixeddat property)": [[15, "pyop2.types.dat.MixedDat.data", false]], "data (pyop2.types.glob.setfreedatacarrier property)": [[15, "pyop2.types.glob.SetFreeDataCarrier.data", false]], "data_ro (pyop2.types.dat.abstractdat property)": [[15, "pyop2.types.dat.AbstractDat.data_ro", false]], "data_ro (pyop2.types.dat.datview property)": [[15, 
"pyop2.types.dat.DatView.data_ro", false]], "data_ro (pyop2.types.dat.mixeddat property)": [[15, "pyop2.types.dat.MixedDat.data_ro", false]], "data_ro (pyop2.types.glob.setfreedatacarrier property)": [[15, "pyop2.types.glob.SetFreeDataCarrier.data_ro", false]], "data_ro_with_halos (pyop2.types.dat.abstractdat property)": [[15, "pyop2.types.dat.AbstractDat.data_ro_with_halos", false]], "data_ro_with_halos (pyop2.types.dat.datview property)": [[15, "pyop2.types.dat.DatView.data_ro_with_halos", false]], "data_ro_with_halos (pyop2.types.dat.mixeddat property)": [[15, "pyop2.types.dat.MixedDat.data_ro_with_halos", false]], "data_ro_with_halos (pyop2.types.glob.setfreedatacarrier property)": [[15, "pyop2.types.glob.SetFreeDataCarrier.data_ro_with_halos", false]], "data_with_halos (pyop2.types.dat.abstractdat property)": [[15, "pyop2.types.dat.AbstractDat.data_with_halos", false]], "data_with_halos (pyop2.types.dat.datview property)": [[15, "pyop2.types.dat.DatView.data_with_halos", false]], "data_with_halos (pyop2.types.dat.mixeddat property)": [[15, "pyop2.types.dat.MixedDat.data_with_halos", false]], "data_with_halos (pyop2.types.glob.setfreedatacarrier property)": [[15, "pyop2.types.glob.SetFreeDataCarrier.data_with_halos", false]], "data_wo (pyop2.types.dat.abstractdat property)": [[15, "pyop2.types.dat.AbstractDat.data_wo", false]], "data_wo (pyop2.types.dat.datview property)": [[15, "pyop2.types.dat.DatView.data_wo", false]], "data_wo (pyop2.types.dat.mixeddat property)": [[15, "pyop2.types.dat.MixedDat.data_wo", false]], "data_wo (pyop2.types.glob.setfreedatacarrier property)": [[15, "pyop2.types.glob.SetFreeDataCarrier.data_wo", false]], "data_wo_with_halos (pyop2.types.dat.abstractdat property)": [[15, "pyop2.types.dat.AbstractDat.data_wo_with_halos", false]], "data_wo_with_halos (pyop2.types.dat.datview property)": [[15, "pyop2.types.dat.DatView.data_wo_with_halos", false]], "data_wo_with_halos (pyop2.types.dat.mixeddat property)": [[15, 
"pyop2.types.dat.MixedDat.data_wo_with_halos", false]], "data_wo_with_halos (pyop2.types.glob.setfreedatacarrier property)": [[15, "pyop2.types.glob.SetFreeDataCarrier.data_wo_with_halos", false]], "datacarrier (class in pyop2.types.data_carrier)": [[15, "pyop2.types.data_carrier.DataCarrier", false]], "dataset (class in pyop2.types.dataset)": [[15, "pyop2.types.dataset.DataSet", false]], "dataset (pyop2.types.dat.abstractdat attribute)": [[15, "pyop2.types.dat.AbstractDat.dataset", false]], "dataset (pyop2.types.dat.mixeddat attribute)": [[15, "pyop2.types.dat.MixedDat.dataset", false]], "dataset (pyop2.types.glob.global attribute)": [[15, "pyop2.types.glob.Global.dataset", false]], "datasettypeerror": [[13, "pyop2.exceptions.DataSetTypeError", false]], "datatypeerror": [[13, "pyop2.exceptions.DataTypeError", false]], "datavalueerror": [[13, "pyop2.exceptions.DataValueError", false]], "dattypeerror": [[13, "pyop2.exceptions.DatTypeError", false]], "datview (class in pyop2.types.dat)": [[15, "pyop2.types.dat.DatView", false]], "decref() (in module pyop2.mpi)": [[13, "pyop2.mpi.decref", false]], "defaults (pyop2.configuration.configuration attribute)": [[13, "pyop2.configuration.Configuration.DEFAULTS", false]], "difference() (pyop2.types.set.set method)": [[15, "pyop2.types.set.Set.difference", false]], "difference() (pyop2.types.set.subset method)": [[15, "pyop2.types.set.Subset.difference", false]], "dim (pyop2.types.dat.abstractdat attribute)": [[15, "pyop2.types.dat.AbstractDat.dim", false]], "dim (pyop2.types.dat.datview attribute)": [[15, "pyop2.types.dat.DatView.dim", false]], "dim (pyop2.types.data_carrier.datacarrier attribute)": [[15, "pyop2.types.data_carrier.DataCarrier.dim", false]], "dim (pyop2.types.dataset.dataset attribute)": [[15, "pyop2.types.dataset.DataSet.dim", false]], "dim (pyop2.types.dataset.globaldataset attribute)": [[15, "pyop2.types.dataset.GlobalDataSet.dim", false]], "dim (pyop2.types.dataset.mixeddataset attribute)": [[15, 
"pyop2.types.dataset.MixedDataSet.dim", false]], "dimtypeerror": [[13, "pyop2.exceptions.DimTypeError", false]], "dm (pyop2.types.dataset.dataset attribute)": [[15, "pyop2.types.dataset.DataSet.dm", false]], "dm (pyop2.types.dataset.globaldataset attribute)": [[15, "pyop2.types.dataset.GlobalDataSet.dm", false]], "dtype (pyop2.types.dat.abstractdat attribute)": [[15, "pyop2.types.dat.AbstractDat.dtype", false]], "dtype (pyop2.types.dat.mixeddat attribute)": [[15, "pyop2.types.dat.MixedDat.dtype", false]], "dtype (pyop2.types.data_carrier.datacarrier attribute)": [[15, "pyop2.types.data_carrier.DataCarrier.dtype", false]], "dtype (pyop2.types.glob.setfreedatacarrier property)": [[15, "pyop2.types.glob.SetFreeDataCarrier.dtype", false]], "dtype (pyop2.types.map.map attribute)": [[15, "pyop2.types.map.Map.dtype", false]], "dtype_limits() (in module pyop2.datatypes)": [[13, "pyop2.datatypes.dtype_limits", false]], "duplicate() (pyop2.types.glob.constant method)": [[15, "pyop2.types.glob.Constant.duplicate", false]], "duplicate() (pyop2.types.glob.global method)": [[15, "pyop2.types.glob.Global.duplicate", false]], "emptydatamixin (class in pyop2.types.data_carrier)": [[15, "pyop2.types.data_carrier.EmptyDataMixin", false]], "extrudedset (class in pyop2.types.set)": [[15, "pyop2.types.set.ExtrudedSet", false]], "field_ises (pyop2.types.dataset.dataset attribute)": [[15, "pyop2.types.dataset.DataSet.field_ises", false]], "flatten() (in module pyop2.utils)": [[13, "pyop2.utils.flatten", false]], "flattened_maps (pyop2.types.map.composedmap attribute)": [[15, "pyop2.types.map.ComposedMap.flattened_maps", false]], "flattened_maps (pyop2.types.map.map attribute)": [[15, "pyop2.types.map.Map.flattened_maps", false]], "flattened_maps (pyop2.types.map.mixedmap attribute)": [[15, "pyop2.types.map.MixedMap.flattened_maps", false]], "freeze_halo() (pyop2.types.dat.abstractdat method)": [[15, "pyop2.types.dat.AbstractDat.freeze_halo", false]], "freeze_halo() 
(pyop2.types.dat.mixeddat method)": [[15, "pyop2.types.dat.MixedDat.freeze_halo", false]], "freeze_halo() (pyop2.types.glob.global method)": [[15, "pyop2.types.glob.Global.freeze_halo", false]], "frozen_halo (class in pyop2.types.dat)": [[15, "pyop2.types.dat.frozen_halo", false]], "frozen_halo() (pyop2.types.dat.abstractdat method)": [[15, "pyop2.types.dat.AbstractDat.frozen_halo", false]], "frozen_halo() (pyop2.types.glob.global method)": [[15, "pyop2.types.glob.Global.frozen_halo", false]], "get_petsc_dir() (in module pyop2.utils)": [[13, "pyop2.utils.get_petsc_dir", false]], "global (class in pyop2.types.glob)": [[15, "pyop2.types.glob.Global", false]], "global_to_local_begin() (pyop2.types.dat.abstractdat method)": [[15, "pyop2.types.dat.AbstractDat.global_to_local_begin", false]], "global_to_local_begin() (pyop2.types.dat.mixeddat method)": [[15, "pyop2.types.dat.MixedDat.global_to_local_begin", false]], "global_to_local_begin() (pyop2.types.glob.global method)": [[15, "pyop2.types.glob.Global.global_to_local_begin", false]], "global_to_local_begin() (pyop2.types.halo.halo method)": [[15, "pyop2.types.halo.Halo.global_to_local_begin", false]], "global_to_local_end() (pyop2.types.dat.abstractdat method)": [[15, "pyop2.types.dat.AbstractDat.global_to_local_end", false]], "global_to_local_end() (pyop2.types.dat.mixeddat method)": [[15, "pyop2.types.dat.MixedDat.global_to_local_end", false]], "global_to_local_end() (pyop2.types.glob.global method)": [[15, "pyop2.types.glob.Global.global_to_local_end", false]], "global_to_local_end() (pyop2.types.halo.halo method)": [[15, "pyop2.types.halo.Halo.global_to_local_end", false]], "globaldataset (class in pyop2.types.dataset)": [[15, "pyop2.types.dataset.GlobalDataSet", false]], "globalset (class in pyop2.types.set)": [[15, "pyop2.types.set.GlobalSet", false]], "halo (class in pyop2.types.halo)": [[15, "pyop2.types.halo.Halo", false]], "halo (pyop2.types.set.globalset attribute)": [[15, "pyop2.types.set.GlobalSet.halo", 
false]], "halo (pyop2.types.set.mixedset attribute)": [[15, "pyop2.types.set.MixedSet.halo", false]], "halo (pyop2.types.set.set attribute)": [[15, "pyop2.types.set.Set.halo", false]], "halo_valid (pyop2.types.dat.datview property)": [[15, "pyop2.types.dat.DatView.halo_valid", false]], "halo_valid (pyop2.types.dat.mixeddat property)": [[15, "pyop2.types.dat.MixedDat.halo_valid", false]], "halo_valid (pyop2.types.glob.setfreedatacarrier property)": [[15, "pyop2.types.glob.SetFreeDataCarrier.halo_valid", false]], "inc (in module pyop2.types.access)": [[15, "pyop2.types.access.INC", false]], "inc (pyop2.types.access.access attribute)": [[15, "pyop2.types.access.Access.INC", false]], "incref() (in module pyop2.mpi)": [[13, "pyop2.mpi.incref", false]], "increment_dat_version() (pyop2.types.dat.mixeddat method)": [[15, "pyop2.types.dat.MixedDat.increment_dat_version", false]], "increment_dat_version() (pyop2.types.data_carrier.datacarrier method)": [[15, "pyop2.types.data_carrier.DataCarrier.increment_dat_version", false]], "indextypeerror": [[13, "pyop2.exceptions.IndexTypeError", false]], "indexvalueerror": [[13, "pyop2.exceptions.IndexValueError", false]], "indices (pyop2.types.set.set property)": [[15, "pyop2.types.set.Set.indices", false]], "indices (pyop2.types.set.subset attribute)": [[15, "pyop2.types.set.Subset.indices", false]], "inner() (pyop2.types.dat.abstractdat method)": [[15, "pyop2.types.dat.AbstractDat.inner", false]], "inner() (pyop2.types.dat.mixeddat method)": [[15, "pyop2.types.dat.MixedDat.inner", false]], "inner() (pyop2.types.glob.setfreedatacarrier method)": [[15, "pyop2.types.glob.SetFreeDataCarrier.inner", false]], "internal_comm() (in module pyop2.mpi)": [[13, "pyop2.mpi.internal_comm", false]], "intersection() (pyop2.types.set.set method)": [[15, "pyop2.types.set.Set.intersection", false]], "intersection() (pyop2.types.set.subset method)": [[15, "pyop2.types.set.Subset.intersection", false]], "is_pyop2_comm() (in module pyop2.mpi)": [[13, 
"pyop2.mpi.is_pyop2_comm", false]], "iteratevalueerror": [[13, "pyop2.exceptions.IterateValueError", false]], "iterset (pyop2.types.map.map attribute)": [[15, "pyop2.types.map.Map.iterset", false]], "iterset (pyop2.types.map.mixedmap attribute)": [[15, "pyop2.types.map.MixedMap.iterset", false]], "kerneltypeerror": [[13, "pyop2.exceptions.KernelTypeError", false]], "layers (pyop2.types.set.extrudedset attribute)": [[15, "pyop2.types.set.ExtrudedSet.layers", false]], "layers (pyop2.types.set.mixedset attribute)": [[15, "pyop2.types.set.MixedSet.layers", false]], "layers (pyop2.types.set.set attribute)": [[15, "pyop2.types.set.Set.layers", false]], "layers_array (pyop2.types.set.extrudedset attribute)": [[15, "pyop2.types.set.ExtrudedSet.layers_array", false]], "layers_array (pyop2.types.set.subset attribute)": [[15, "pyop2.types.set.Subset.layers_array", false]], "layout_vec (pyop2.types.dataset.dataset attribute)": [[15, "pyop2.types.dataset.DataSet.layout_vec", false]], "layout_vec (pyop2.types.dataset.globaldataset attribute)": [[15, "pyop2.types.dataset.GlobalDataSet.layout_vec", false]], "layout_vec (pyop2.types.dataset.mixeddataset attribute)": [[15, "pyop2.types.dataset.MixedDataSet.layout_vec", false]], "lgmap (pyop2.types.dataset.dataset attribute)": [[15, "pyop2.types.dataset.DataSet.lgmap", false]], "lgmap (pyop2.types.dataset.globaldataset attribute)": [[15, "pyop2.types.dataset.GlobalDataSet.lgmap", false]], "lgmap (pyop2.types.dataset.mixeddataset attribute)": [[15, "pyop2.types.dataset.MixedDataSet.lgmap", false]], "load() (pyop2.types.dat.abstractdat method)": [[15, "pyop2.types.dat.AbstractDat.load", false]], "local_ises (pyop2.types.dataset.dataset attribute)": [[15, "pyop2.types.dataset.DataSet.local_ises", false]], "local_ises (pyop2.types.dataset.globaldataset attribute)": [[15, "pyop2.types.dataset.GlobalDataSet.local_ises", false]], "local_to_global_begin() (pyop2.types.dat.abstractdat method)": [[15, 
"pyop2.types.dat.AbstractDat.local_to_global_begin", false]], "local_to_global_begin() (pyop2.types.dat.mixeddat method)": [[15, "pyop2.types.dat.MixedDat.local_to_global_begin", false]], "local_to_global_begin() (pyop2.types.glob.global method)": [[15, "pyop2.types.glob.Global.local_to_global_begin", false]], "local_to_global_begin() (pyop2.types.halo.halo method)": [[15, "pyop2.types.halo.Halo.local_to_global_begin", false]], "local_to_global_end() (pyop2.types.dat.abstractdat method)": [[15, "pyop2.types.dat.AbstractDat.local_to_global_end", false]], "local_to_global_end() (pyop2.types.dat.mixeddat method)": [[15, "pyop2.types.dat.MixedDat.local_to_global_end", false]], "local_to_global_end() (pyop2.types.glob.global method)": [[15, "pyop2.types.glob.Global.local_to_global_end", false]], "local_to_global_end() (pyop2.types.halo.halo method)": [[15, "pyop2.types.halo.Halo.local_to_global_end", false]], "local_to_global_numbering (pyop2.types.halo.halo property)": [[15, "pyop2.types.halo.Halo.local_to_global_numbering", false]], "log() (in module pyop2.logger)": [[13, "pyop2.logger.log", false]], "map (class in pyop2.types.map)": [[15, "pyop2.types.map.Map", false]], "maptypeerror": [[13, "pyop2.exceptions.MapTypeError", false]], "mapvalueerror": [[13, "pyop2.exceptions.MapValueError", false]], "mattypeerror": [[13, "pyop2.exceptions.MatTypeError", false]], "max (in module pyop2.types.access)": [[15, "pyop2.types.access.MAX", false]], "max (pyop2.types.access.access attribute)": [[15, "pyop2.types.access.Access.MAX", false]], "min (in module pyop2.types.access)": [[15, "pyop2.types.access.MIN", false]], "min (pyop2.types.access.access attribute)": [[15, "pyop2.types.access.Access.MIN", false]], "mixeddat (class in pyop2.types.dat)": [[15, "pyop2.types.dat.MixedDat", false]], "mixeddataset (class in pyop2.types.dataset)": [[15, "pyop2.types.dataset.MixedDataSet", false]], "mixedmap (class in pyop2.types.map)": [[15, "pyop2.types.map.MixedMap", false]], "mixedset 
(class in pyop2.types.set)": [[15, "pyop2.types.set.MixedSet", false]], "modevalueerror": [[13, "pyop2.exceptions.ModeValueError", false]], "module": [[13, "module-pyop2.configuration", false], [13, "module-pyop2.datatypes", false], [13, "module-pyop2.exceptions", false], [13, "module-pyop2.logger", false], [13, "module-pyop2.mpi", false], [13, "module-pyop2.profiling", false], [13, "module-pyop2.utils", false], [15, "module-pyop2.types.access", false], [15, "module-pyop2.types.dat", false], [15, "module-pyop2.types.data_carrier", false], [15, "module-pyop2.types.dataset", false], [15, "module-pyop2.types.glob", false], [15, "module-pyop2.types.halo", false], [15, "module-pyop2.types.map", false], [15, "module-pyop2.types.set", false]], "name (pyop2.types.data_carrier.datacarrier attribute)": [[15, "pyop2.types.data_carrier.DataCarrier.name", false]], "name (pyop2.types.dataset.dataset attribute)": [[15, "pyop2.types.dataset.DataSet.name", false]], "name (pyop2.types.dataset.globaldataset attribute)": [[15, "pyop2.types.dataset.GlobalDataSet.name", false]], "name (pyop2.types.dataset.mixeddataset attribute)": [[15, "pyop2.types.dataset.MixedDataSet.name", false]], "name (pyop2.types.map.map attribute)": [[15, "pyop2.types.map.Map.name", false]], "name (pyop2.types.map.mixedmap attribute)": [[15, "pyop2.types.map.MixedMap.name", false]], "name (pyop2.types.set.globalset attribute)": [[15, "pyop2.types.set.GlobalSet.name", false]], "name (pyop2.types.set.mixedset attribute)": [[15, "pyop2.types.set.MixedSet.name", false]], "name (pyop2.types.set.set attribute)": [[15, "pyop2.types.set.Set.name", false]], "nametypeerror": [[13, "pyop2.exceptions.NameTypeError", false]], "nbytes (pyop2.types.dat.abstractdat attribute)": [[15, "pyop2.types.dat.AbstractDat.nbytes", false]], "nbytes (pyop2.types.dat.mixeddat attribute)": [[15, "pyop2.types.dat.MixedDat.nbytes", false]], "nbytes (pyop2.types.glob.setfreedatacarrier property)": [[15, 
"pyop2.types.glob.SetFreeDataCarrier.nbytes", false]], "norm (pyop2.types.dat.abstractdat property)": [[15, "pyop2.types.dat.AbstractDat.norm", false]], "offset (pyop2.types.map.map attribute)": [[15, "pyop2.types.map.Map.offset", false]], "offset (pyop2.types.map.mixedmap attribute)": [[15, "pyop2.types.map.MixedMap.offset", false]], "offset_quotient (pyop2.types.map.map attribute)": [[15, "pyop2.types.map.Map.offset_quotient", false]], "offset_quotient (pyop2.types.map.mixedmap attribute)": [[15, "pyop2.types.map.MixedMap.offset_quotient", false]], "opaquetype (class in pyop2.datatypes)": [[13, "pyop2.datatypes.OpaqueType", false]], "owned_indices (pyop2.types.set.subset attribute)": [[15, "pyop2.types.set.Subset.owned_indices", false]], "owned_part (pyop2.types.set.set attribute)": [[15, "pyop2.types.set.Set.owned_part", false]], "parent (pyop2.types.set.extrudedset attribute)": [[15, "pyop2.types.set.ExtrudedSet.parent", false]], "parse_args() (in module pyop2.utils)": [[13, "pyop2.utils.parse_args", false]], "parser() (in module pyop2.utils)": [[13, "pyop2.utils.parser", false]], "partition_size (pyop2.types.set.globalset property)": [[15, "pyop2.types.set.GlobalSet.partition_size", false]], "partition_size (pyop2.types.set.set property)": [[15, "pyop2.types.set.Set.partition_size", false]], "permutedmap (class in pyop2.types.map)": [[15, "pyop2.types.map.PermutedMap", false]], "progress() (in module pyop2.logger)": [[13, "pyop2.logger.progress", false]], "pyop2.configuration": [[13, "module-pyop2.configuration", false]], "pyop2.datatypes": [[13, "module-pyop2.datatypes", false]], "pyop2.exceptions": [[13, "module-pyop2.exceptions", false]], "pyop2.logger": [[13, "module-pyop2.logger", false]], "pyop2.mpi": [[13, "module-pyop2.mpi", false]], "pyop2.profiling": [[13, "module-pyop2.profiling", false]], "pyop2.types.access": [[15, "module-pyop2.types.access", false]], "pyop2.types.dat": [[15, "module-pyop2.types.dat", false]], "pyop2.types.data_carrier": [[15, 
"module-pyop2.types.data_carrier", false]], "pyop2.types.dataset": [[15, "module-pyop2.types.dataset", false]], "pyop2.types.glob": [[15, "module-pyop2.types.glob", false]], "pyop2.types.halo": [[15, "module-pyop2.types.halo", false]], "pyop2.types.map": [[15, "module-pyop2.types.map", false]], "pyop2.types.set": [[15, "module-pyop2.types.set", false]], "pyop2.utils": [[13, "module-pyop2.utils", false]], "read (in module pyop2.types.access)": [[15, "pyop2.types.access.READ", false]], "read (pyop2.types.access.access attribute)": [[15, "pyop2.types.access.Access.READ", false]], "reconfigure() (pyop2.configuration.configuration method)": [[13, "pyop2.configuration.Configuration.reconfigure", false]], "reset() (pyop2.configuration.configuration method)": [[13, "pyop2.configuration.Configuration.reset", false]], "rw (in module pyop2.types.access)": [[15, "pyop2.types.access.RW", false]], "rw (pyop2.types.access.access attribute)": [[15, "pyop2.types.access.Access.RW", false]], "save() (pyop2.types.dat.abstractdat method)": [[15, "pyop2.types.dat.AbstractDat.save", false]], "scalar_lgmap (pyop2.types.dataset.dataset attribute)": [[15, "pyop2.types.dataset.DataSet.scalar_lgmap", false]], "set (class in pyop2.types.set)": [[15, "pyop2.types.set.Set", false]], "set (pyop2.types.dataset.dataset attribute)": [[15, "pyop2.types.dataset.DataSet.set", false]], "set (pyop2.types.dataset.globaldataset attribute)": [[15, "pyop2.types.dataset.GlobalDataSet.set", false]], "set (pyop2.types.dataset.mixeddataset attribute)": [[15, "pyop2.types.dataset.MixedDataSet.set", false]], "set_log_level() (in module pyop2.logger)": [[13, "pyop2.logger.set_log_level", false]], "setfreedatacarrier (class in pyop2.types.glob)": [[15, "pyop2.types.glob.SetFreeDataCarrier", false]], "setpartition (class in pyop2.types.set)": [[15, "pyop2.types.set.SetPartition", false]], "settypeerror": [[13, "pyop2.exceptions.SetTypeError", false]], "setvalueerror": [[13, "pyop2.exceptions.SetValueError", false]], 
"shape (pyop2.types.dat.abstractdat attribute)": [[15, "pyop2.types.dat.AbstractDat.shape", false]], "shape (pyop2.types.dat.datview attribute)": [[15, "pyop2.types.dat.DatView.shape", false]], "shape (pyop2.types.glob.setfreedatacarrier property)": [[15, "pyop2.types.glob.SetFreeDataCarrier.shape", false]], "size (pyop2.types.dataset.globaldataset attribute)": [[15, "pyop2.types.dataset.GlobalDataSet.size", false]], "size (pyop2.types.set.globalset attribute)": [[15, "pyop2.types.set.GlobalSet.size", false]], "size (pyop2.types.set.mixedset attribute)": [[15, "pyop2.types.set.MixedSet.size", false]], "size (pyop2.types.set.set attribute)": [[15, "pyop2.types.set.Set.size", false]], "sizes (pyop2.types.set.globalset attribute)": [[15, "pyop2.types.set.GlobalSet.sizes", false]], "sizes (pyop2.types.set.mixedset attribute)": [[15, "pyop2.types.set.MixedSet.sizes", false]], "sizes (pyop2.types.set.set attribute)": [[15, "pyop2.types.set.Set.sizes", false]], "sizetypeerror": [[13, "pyop2.exceptions.SizeTypeError", false]], "sparsityformaterror": [[13, "pyop2.exceptions.SparsityFormatError", false]], "sparsitytypeerror": [[13, "pyop2.exceptions.SparsityTypeError", false]], "split (pyop2.types.dat.abstractdat attribute)": [[15, "pyop2.types.dat.AbstractDat.split", false]], "split (pyop2.types.dat.mixeddat attribute)": [[15, "pyop2.types.dat.MixedDat.split", false]], "split (pyop2.types.dataset.mixeddataset attribute)": [[15, "pyop2.types.dataset.MixedDataSet.split", false]], "split (pyop2.types.glob.setfreedatacarrier property)": [[15, "pyop2.types.glob.SetFreeDataCarrier.split", false]], "split (pyop2.types.map.map attribute)": [[15, "pyop2.types.map.Map.split", false]], "split (pyop2.types.map.mixedmap attribute)": [[15, "pyop2.types.map.MixedMap.split", false]], "split (pyop2.types.set.mixedset attribute)": [[15, "pyop2.types.set.MixedSet.split", false]], "strip() (in module pyop2.utils)": [[13, "pyop2.utils.strip", false]], "subset (class in pyop2.types.set)": [[15, 
"pyop2.types.set.Subset", false]], "subsetindexoutofbounds": [[13, "pyop2.exceptions.SubsetIndexOutOfBounds", false]], "superset (pyop2.types.set.subset attribute)": [[15, "pyop2.types.set.Subset.superset", false]], "symmetric_difference() (pyop2.types.set.set method)": [[15, "pyop2.types.set.Set.symmetric_difference", false]], "symmetric_difference() (pyop2.types.set.subset method)": [[15, "pyop2.types.set.Subset.symmetric_difference", false]], "temp_internal_comm (class in pyop2.mpi)": [[13, "pyop2.mpi.temp_internal_comm", false]], "timed_function (class in pyop2.profiling)": [[13, "pyop2.profiling.timed_function", false]], "timed_region() (in module pyop2.profiling)": [[13, "pyop2.profiling.timed_region", false]], "timed_stage() (in module pyop2.profiling)": [[13, "pyop2.profiling.timed_stage", false]], "toset (pyop2.types.map.map attribute)": [[15, "pyop2.types.map.Map.toset", false]], "toset (pyop2.types.map.mixedmap attribute)": [[15, "pyop2.types.map.MixedMap.toset", false]], "total_size (pyop2.types.set.globalset attribute)": [[15, "pyop2.types.set.GlobalSet.total_size", false]], "total_size (pyop2.types.set.mixedset attribute)": [[15, "pyop2.types.set.MixedSet.total_size", false]], "total_size (pyop2.types.set.set attribute)": [[15, "pyop2.types.set.Set.total_size", false]], "trim() (in module pyop2.utils)": [[13, "pyop2.utils.trim", false]], "tuplify() (in module pyop2.utils)": [[13, "pyop2.utils.tuplify", false]], "unblocked_lgmap (pyop2.types.dataset.dataset attribute)": [[15, "pyop2.types.dataset.DataSet.unblocked_lgmap", false]], "unblocked_lgmap (pyop2.types.dataset.globaldataset attribute)": [[15, "pyop2.types.dataset.GlobalDataSet.unblocked_lgmap", false]], "unblocked_lgmap (pyop2.types.dataset.mixeddataset attribute)": [[15, "pyop2.types.dataset.MixedDataSet.unblocked_lgmap", false]], "unfreeze_halo() (pyop2.types.dat.abstractdat method)": [[15, "pyop2.types.dat.AbstractDat.unfreeze_halo", false]], "unfreeze_halo() (pyop2.types.dat.mixeddat 
method)": [[15, "pyop2.types.dat.MixedDat.unfreeze_halo", false]], "unfreeze_halo() (pyop2.types.glob.global method)": [[15, "pyop2.types.glob.Global.unfreeze_halo", false]], "union() (pyop2.types.set.set method)": [[15, "pyop2.types.set.Set.union", false]], "union() (pyop2.types.set.subset method)": [[15, "pyop2.types.set.Subset.union", false]], "unsafe_reconfigure() (pyop2.configuration.configuration method)": [[13, "pyop2.configuration.Configuration.unsafe_reconfigure", false]], "validate_base (class in pyop2.utils)": [[13, "pyop2.utils.validate_base", false]], "validate_dtype (class in pyop2.utils)": [[13, "pyop2.utils.validate_dtype", false]], "validate_in (class in pyop2.utils)": [[13, "pyop2.utils.validate_in", false]], "validate_range (class in pyop2.utils)": [[13, "pyop2.utils.validate_range", false]], "validate_type (class in pyop2.utils)": [[13, "pyop2.utils.validate_type", false]], "values (pyop2.types.map.composedmap attribute)": [[15, "pyop2.types.map.ComposedMap.values", false]], "values (pyop2.types.map.map attribute)": [[15, "pyop2.types.map.Map.values", false]], "values (pyop2.types.map.mixedmap attribute)": [[15, "pyop2.types.map.MixedMap.values", false]], "values_with_halo (pyop2.types.map.composedmap attribute)": [[15, "pyop2.types.map.ComposedMap.values_with_halo", false]], "values_with_halo (pyop2.types.map.map attribute)": [[15, "pyop2.types.map.Map.values_with_halo", false]], "values_with_halo (pyop2.types.map.mixedmap attribute)": [[15, "pyop2.types.map.MixedMap.values_with_halo", false]], "vec (pyop2.types.data_carrier.vecaccessmixin property)": [[15, "pyop2.types.data_carrier.VecAccessMixin.vec", false]], "vec_context() (pyop2.types.dat.dat method)": [[15, "pyop2.types.dat.Dat.vec_context", false]], "vec_context() (pyop2.types.dat.mixeddat method)": [[15, "pyop2.types.dat.MixedDat.vec_context", false]], "vec_context() (pyop2.types.data_carrier.vecaccessmixin method)": [[15, "pyop2.types.data_carrier.VecAccessMixin.vec_context", false]], 
"vec_context() (pyop2.types.glob.global method)": [[15, "pyop2.types.glob.Global.vec_context", false]], "vec_ro (pyop2.types.data_carrier.vecaccessmixin property)": [[15, "pyop2.types.data_carrier.VecAccessMixin.vec_ro", false]], "vec_wo (pyop2.types.data_carrier.vecaccessmixin property)": [[15, "pyop2.types.data_carrier.VecAccessMixin.vec_wo", false]], "vecaccessmixin (class in pyop2.types.data_carrier)": [[15, "pyop2.types.data_carrier.VecAccessMixin", false]], "verify_reshape() (in module pyop2.utils)": [[13, "pyop2.utils.verify_reshape", false]], "write (in module pyop2.types.access)": [[15, "pyop2.types.access.WRITE", false]], "write (pyop2.types.access.access attribute)": [[15, "pyop2.types.access.Access.WRITE", false]], "zero() (pyop2.types.dat.abstractdat method)": [[15, "pyop2.types.dat.AbstractDat.zero", false]], "zero() (pyop2.types.dat.mixeddat method)": [[15, "pyop2.types.dat.MixedDat.zero", false]], "zero() (pyop2.types.glob.global method)": [[15, "pyop2.types.glob.Global.zero", false]]}, "objects": {"pyop2": [[13, 0, 0, "-", "configuration"], [13, 0, 0, "-", "datatypes"], [13, 0, 0, "-", "exceptions"], [13, 0, 0, "-", "logger"], [13, 0, 0, "-", "mpi"], [13, 0, 0, "-", "profiling"], [13, 0, 0, "-", "utils"]], "pyop2.configuration": [[13, 1, 1, "", "Configuration"]], "pyop2.configuration.Configuration": [[13, 2, 1, "", "DEFAULTS"], [13, 2, 1, "", "cache_dir"], [13, 3, 1, "", "reconfigure"], [13, 3, 1, "", "reset"], [13, 3, 1, "", "unsafe_reconfigure"]], "pyop2.datatypes": [[13, 1, 1, "", "OpaqueType"], [13, 4, 1, "", "as_cstr"], [13, 4, 1, "", "as_ctypes"], [13, 4, 1, "", "as_numpy_dtype"], [13, 4, 1, "", "dtype_limits"]], "pyop2.exceptions": [[13, 5, 1, "", "ArityTypeError"], [13, 5, 1, "", "CompilationError"], [13, 5, 1, "", "ConfigurationError"], [13, 5, 1, "", "DatTypeError"], [13, 5, 1, "", "DataSetTypeError"], [13, 5, 1, "", "DataTypeError"], [13, 5, 1, "", "DataValueError"], [13, 5, 1, "", "DimTypeError"], [13, 5, 1, "", "IndexTypeError"], [13, 
5, 1, "", "IndexValueError"], [13, 5, 1, "", "IterateValueError"], [13, 5, 1, "", "KernelTypeError"], [13, 5, 1, "", "MapTypeError"], [13, 5, 1, "", "MapValueError"], [13, 5, 1, "", "MatTypeError"], [13, 5, 1, "", "ModeValueError"], [13, 5, 1, "", "NameTypeError"], [13, 5, 1, "", "SetTypeError"], [13, 5, 1, "", "SetValueError"], [13, 5, 1, "", "SizeTypeError"], [13, 5, 1, "", "SparsityFormatError"], [13, 5, 1, "", "SparsityTypeError"], [13, 5, 1, "", "SubsetIndexOutOfBounds"]], "pyop2.logger": [[13, 4, 1, "", "log"], [13, 4, 1, "", "progress"], [13, 4, 1, "", "set_log_level"]], "pyop2.mpi": [[13, 4, 1, "", "decref"], [13, 4, 1, "", "incref"], [13, 4, 1, "", "internal_comm"], [13, 4, 1, "", "is_pyop2_comm"], [13, 1, 1, "", "temp_internal_comm"]], "pyop2.profiling": [[13, 1, 1, "", "timed_function"], [13, 4, 1, "", "timed_region"], [13, 4, 1, "", "timed_stage"]], "pyop2.types": [[15, 0, 0, "-", "access"], [15, 0, 0, "-", "dat"], [15, 0, 0, "-", "data_carrier"], [15, 0, 0, "-", "dataset"], [15, 0, 0, "-", "glob"], [15, 0, 0, "-", "halo"], [15, 0, 0, "-", "map"], [15, 0, 0, "-", "set"]], "pyop2.types.access": [[15, 1, 1, "", "Access"], [15, 6, 1, "", "INC"], [15, 6, 1, "", "MAX"], [15, 6, 1, "", "MIN"], [15, 6, 1, "", "READ"], [15, 6, 1, "", "RW"], [15, 6, 1, "", "WRITE"]], "pyop2.types.access.Access": [[15, 2, 1, "", "INC"], [15, 2, 1, "", "MAX"], [15, 2, 1, "", "MIN"], [15, 2, 1, "", "READ"], [15, 2, 1, "", "RW"], [15, 2, 1, "", "WRITE"]], "pyop2.types.dat": [[15, 1, 1, "", "AbstractDat"], [15, 1, 1, "", "Dat"], [15, 1, 1, "", "DatView"], [15, 1, 1, "", "MixedDat"], [15, 1, 1, "", "frozen_halo"]], "pyop2.types.dat.AbstractDat": [[15, 2, 1, "", "cdim"], [15, 3, 1, "", "copy"], [15, 7, 1, "", "data"], [15, 7, 1, "", "data_ro"], [15, 7, 1, "", "data_ro_with_halos"], [15, 7, 1, "", "data_with_halos"], [15, 7, 1, "", "data_wo"], [15, 7, 1, "", "data_wo_with_halos"], [15, 2, 1, "", "dataset"], [15, 2, 1, "", "dim"], [15, 2, 1, "", "dtype"], [15, 3, 1, "", "freeze_halo"], 
[15, 3, 1, "", "frozen_halo"], [15, 3, 1, "", "global_to_local_begin"], [15, 3, 1, "", "global_to_local_end"], [15, 3, 1, "", "inner"], [15, 3, 1, "", "load"], [15, 3, 1, "", "local_to_global_begin"], [15, 3, 1, "", "local_to_global_end"], [15, 2, 1, "", "nbytes"], [15, 7, 1, "", "norm"], [15, 3, 1, "", "save"], [15, 2, 1, "", "shape"], [15, 2, 1, "", "split"], [15, 3, 1, "", "unfreeze_halo"], [15, 3, 1, "", "zero"]], "pyop2.types.dat.Dat": [[15, 3, 1, "", "vec_context"]], "pyop2.types.dat.DatView": [[15, 2, 1, "", "cdim"], [15, 7, 1, "", "data"], [15, 7, 1, "", "data_ro"], [15, 7, 1, "", "data_ro_with_halos"], [15, 7, 1, "", "data_with_halos"], [15, 7, 1, "", "data_wo"], [15, 7, 1, "", "data_wo_with_halos"], [15, 2, 1, "", "dim"], [15, 7, 1, "", "halo_valid"], [15, 2, 1, "", "shape"]], "pyop2.types.dat.MixedDat": [[15, 3, 1, "", "copy"], [15, 7, 1, "", "dat_version"], [15, 7, 1, "", "data"], [15, 7, 1, "", "data_ro"], [15, 7, 1, "", "data_ro_with_halos"], [15, 7, 1, "", "data_with_halos"], [15, 7, 1, "", "data_wo"], [15, 7, 1, "", "data_wo_with_halos"], [15, 2, 1, "", "dataset"], [15, 2, 1, "", "dtype"], [15, 3, 1, "", "freeze_halo"], [15, 3, 1, "", "global_to_local_begin"], [15, 3, 1, "", "global_to_local_end"], [15, 7, 1, "", "halo_valid"], [15, 3, 1, "", "increment_dat_version"], [15, 3, 1, "", "inner"], [15, 3, 1, "", "local_to_global_begin"], [15, 3, 1, "", "local_to_global_end"], [15, 2, 1, "", "nbytes"], [15, 2, 1, "", "split"], [15, 3, 1, "", "unfreeze_halo"], [15, 3, 1, "", "vec_context"], [15, 3, 1, "", "zero"]], "pyop2.types.data_carrier": [[15, 1, 1, "", "DataCarrier"], [15, 1, 1, "", "EmptyDataMixin"], [15, 1, 1, "", "VecAccessMixin"]], "pyop2.types.data_carrier.DataCarrier": [[15, 2, 1, "", "cdim"], [15, 2, 1, "", "ctype"], [15, 2, 1, "", "dim"], [15, 2, 1, "", "dtype"], [15, 3, 1, "", "increment_dat_version"], [15, 2, 1, "", "name"]], "pyop2.types.data_carrier.VecAccessMixin": [[15, 7, 1, "", "dat_version"], [15, 7, 1, "", "vec"], [15, 3, 1, "", 
"vec_context"], [15, 7, 1, "", "vec_ro"], [15, 7, 1, "", "vec_wo"]], "pyop2.types.dataset": [[15, 1, 1, "", "DataSet"], [15, 1, 1, "", "GlobalDataSet"], [15, 1, 1, "", "MixedDataSet"]], "pyop2.types.dataset.DataSet": [[15, 2, 1, "", "cdim"], [15, 2, 1, "", "dim"], [15, 2, 1, "", "dm"], [15, 2, 1, "", "field_ises"], [15, 2, 1, "", "layout_vec"], [15, 2, 1, "", "lgmap"], [15, 2, 1, "", "local_ises"], [15, 2, 1, "", "name"], [15, 2, 1, "", "scalar_lgmap"], [15, 2, 1, "", "set"], [15, 2, 1, "", "unblocked_lgmap"]], "pyop2.types.dataset.GlobalDataSet": [[15, 2, 1, "", "cdim"], [15, 2, 1, "", "dim"], [15, 2, 1, "", "dm"], [15, 2, 1, "", "layout_vec"], [15, 2, 1, "", "lgmap"], [15, 2, 1, "", "local_ises"], [15, 2, 1, "", "name"], [15, 2, 1, "", "set"], [15, 2, 1, "", "size"], [15, 2, 1, "", "unblocked_lgmap"]], "pyop2.types.dataset.MixedDataSet": [[15, 2, 1, "", "cdim"], [15, 2, 1, "", "dim"], [15, 2, 1, "", "layout_vec"], [15, 2, 1, "", "lgmap"], [15, 2, 1, "", "name"], [15, 2, 1, "", "set"], [15, 2, 1, "", "split"], [15, 2, 1, "", "unblocked_lgmap"]], "pyop2.types.glob": [[15, 1, 1, "", "Constant"], [15, 1, 1, "", "Global"], [15, 1, 1, "", "SetFreeDataCarrier"]], "pyop2.types.glob.Constant": [[15, 3, 1, "", "duplicate"]], "pyop2.types.glob.Global": [[15, 2, 1, "", "dataset"], [15, 3, 1, "", "duplicate"], [15, 3, 1, "", "freeze_halo"], [15, 3, 1, "", "frozen_halo"], [15, 3, 1, "", "global_to_local_begin"], [15, 3, 1, "", "global_to_local_end"], [15, 3, 1, "", "local_to_global_begin"], [15, 3, 1, "", "local_to_global_end"], [15, 3, 1, "", "unfreeze_halo"], [15, 3, 1, "", "vec_context"], [15, 3, 1, "", "zero"]], "pyop2.types.glob.SetFreeDataCarrier": [[15, 3, 1, "", "copy"], [15, 7, 1, "", "data"], [15, 7, 1, "", "data_ro"], [15, 7, 1, "", "data_ro_with_halos"], [15, 7, 1, "", "data_with_halos"], [15, 7, 1, "", "data_wo"], [15, 7, 1, "", "data_wo_with_halos"], [15, 7, 1, "", "dtype"], [15, 7, 1, "", "halo_valid"], [15, 3, 1, "", "inner"], [15, 7, 1, "", "nbytes"], [15, 7, 
1, "", "shape"], [15, 7, 1, "", "split"]], "pyop2.types.halo": [[15, 1, 1, "", "Halo"]], "pyop2.types.halo.Halo": [[15, 7, 1, "", "comm"], [15, 3, 1, "", "global_to_local_begin"], [15, 3, 1, "", "global_to_local_end"], [15, 3, 1, "", "local_to_global_begin"], [15, 3, 1, "", "local_to_global_end"], [15, 7, 1, "", "local_to_global_numbering"]], "pyop2.types.map": [[15, 1, 1, "", "ComposedMap"], [15, 1, 1, "", "Map"], [15, 1, 1, "", "MixedMap"], [15, 1, 1, "", "PermutedMap"]], "pyop2.types.map.ComposedMap": [[15, 2, 1, "", "flattened_maps"], [15, 2, 1, "", "values"], [15, 2, 1, "", "values_with_halo"]], "pyop2.types.map.Map": [[15, 2, 1, "", "arange"], [15, 2, 1, "", "arities"], [15, 2, 1, "", "arity"], [15, 2, 1, "", "dtype"], [15, 2, 1, "", "flattened_maps"], [15, 2, 1, "", "iterset"], [15, 2, 1, "", "name"], [15, 2, 1, "", "offset"], [15, 2, 1, "", "offset_quotient"], [15, 2, 1, "", "split"], [15, 2, 1, "", "toset"], [15, 2, 1, "", "values"], [15, 2, 1, "", "values_with_halo"]], "pyop2.types.map.MixedMap": [[15, 2, 1, "", "arange"], [15, 2, 1, "", "arities"], [15, 2, 1, "", "arity"], [15, 2, 1, "", "flattened_maps"], [15, 2, 1, "", "iterset"], [15, 2, 1, "", "name"], [15, 2, 1, "", "offset"], [15, 2, 1, "", "offset_quotient"], [15, 2, 1, "", "split"], [15, 2, 1, "", "toset"], [15, 2, 1, "", "values"], [15, 2, 1, "", "values_with_halo"]], "pyop2.types.set": [[15, 1, 1, "", "ExtrudedSet"], [15, 1, 1, "", "GlobalSet"], [15, 1, 1, "", "MixedSet"], [15, 1, 1, "", "Set"], [15, 1, 1, "", "SetPartition"], [15, 1, 1, "", "Subset"]], "pyop2.types.set.ExtrudedSet": [[15, 2, 1, "", "layers"], [15, 2, 1, "", "layers_array"], [15, 2, 1, "", "parent"]], "pyop2.types.set.GlobalSet": [[15, 2, 1, "", "core_size"], [15, 2, 1, "", "halo"], [15, 2, 1, "", "name"], [15, 7, 1, "", "partition_size"], [15, 2, 1, "", "size"], [15, 2, 1, "", "sizes"], [15, 2, 1, "", "total_size"]], "pyop2.types.set.MixedSet": [[15, 2, 1, "", "constrained_size"], [15, 2, 1, "", "core_size"], [15, 2, 1, "", 
"halo"], [15, 2, 1, "", "layers"], [15, 2, 1, "", "name"], [15, 2, 1, "", "size"], [15, 2, 1, "", "sizes"], [15, 2, 1, "", "split"], [15, 2, 1, "", "total_size"]], "pyop2.types.set.Set": [[15, 2, 1, "", "constrained_size"], [15, 2, 1, "", "core_part"], [15, 2, 1, "", "core_size"], [15, 3, 1, "", "difference"], [15, 2, 1, "", "halo"], [15, 7, 1, "", "indices"], [15, 3, 1, "", "intersection"], [15, 2, 1, "", "layers"], [15, 2, 1, "", "name"], [15, 2, 1, "", "owned_part"], [15, 7, 1, "", "partition_size"], [15, 2, 1, "", "size"], [15, 2, 1, "", "sizes"], [15, 3, 1, "", "symmetric_difference"], [15, 2, 1, "", "total_size"], [15, 3, 1, "", "union"]], "pyop2.types.set.Subset": [[15, 3, 1, "", "difference"], [15, 2, 1, "", "indices"], [15, 3, 1, "", "intersection"], [15, 2, 1, "", "layers_array"], [15, 2, 1, "", "owned_indices"], [15, 2, 1, "", "superset"], [15, 3, 1, "", "symmetric_difference"], [15, 3, 1, "", "union"]], "pyop2.utils": [[13, 4, 1, "", "align"], [13, 4, 1, "", "as_tuple"], [13, 4, 1, "", "as_type"], [13, 1, 1, "", "cached_property"], [13, 4, 1, "", "flatten"], [13, 4, 1, "", "get_petsc_dir"], [13, 4, 1, "", "parse_args"], [13, 4, 1, "", "parser"], [13, 4, 1, "", "strip"], [13, 4, 1, "", "trim"], [13, 4, 1, "", "tuplify"], [13, 1, 1, "", "validate_base"], [13, 1, 1, "", "validate_dtype"], [13, 1, 1, "", "validate_in"], [13, 1, 1, "", "validate_range"], [13, 1, 1, "", "validate_type"], [13, 4, 1, "", "verify_reshape"]], "pyop2.utils.validate_base": [[13, 3, 1, "", "check_args"]], "pyop2.utils.validate_dtype": [[13, 3, 1, "", "check_arg"]], "pyop2.utils.validate_in": [[13, 3, 1, "", "check_arg"]], "pyop2.utils.validate_range": [[13, 3, 1, "", "check_arg"]], "pyop2.utils.validate_type": [[13, 3, 1, "", "check_arg"]]}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "attribute", "Python attribute"], "3": ["py", "method", "Python method"], "4": ["py", "function", "Python function"], "5": ["py", "exception", 
"Python exception"], "6": ["py", "data", "Python data"], "7": ["py", "property", "Python property"]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:attribute", "3": "py:method", "4": "py:function", "5": "py:exception", "6": "py:data", "7": "py:property"}, "terms": {"": [0, 1, 2, 6, 8, 9, 10, 15], "0": [1, 3, 6, 7, 8, 9, 10, 12, 15], "04": 5, "1": [1, 2, 3, 6, 7, 8, 9, 10, 12, 13, 15], "10": 2, "12": 7, "12x12": 7, "14": 3, "15": 1, "16": 13, "18": 5, "1e": 3, "1x1": 9, "1x2": 9, "2": [1, 3, 6, 7, 8, 9, 12, 13, 15], "257": 13, "2x1": 9, "2x2": [3, 9], "3": [1, 3, 5, 6, 7, 8, 9, 12, 13, 15], "30": 1, "32": [1, 11], "3x3": 9, "4": [3, 7, 13, 15], "41": 1, "5": [7, 12, 15], "6": [5, 7, 15], "64": [1, 5, 11], "668": 1, "8": [6, 13], "8x8": 6, "9": 8, "A": [0, 1, 3, 6, 7, 8, 9, 10, 11, 12, 13, 15], "As": [0, 1, 2, 6, 7, 8, 15], "At": [8, 11], "Being": 6, "By": [6, 10], "For": [0, 2, 3, 6, 7, 8, 9, 10, 11, 15], "ISes": 15, "If": [0, 1, 6, 7, 10, 11, 13, 15], "In": [1, 2, 3, 6, 7, 8, 9, 15], "It": [1, 3, 4, 6, 7, 11, 15], "No": [6, 15], "Not": 3, "Of": [3, 6], "On": [2, 6, 7], "One": [3, 6], "That": 6, "The": [0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 15], "Then": 6, "There": [1, 2, 3, 6, 7, 9, 10, 12, 15], "These": [1, 2, 8], "To": [1, 2, 3, 6, 7, 10, 11, 12], "With": [12, 15], "_": 13, "__atomic_add": 8, "__attribute__": 1, "__b": 1, "__builtins__": 12, "__device__": 1, "__doc__": 12, "__global": 1, "__global__": [1, 8], "__kernel": 1, "__lma_to_csr": 8, "__local": 1, "__mat_kernel_stub": 8, "__midpoint_stub": 1, "__privat": 1, "__shared__": 1, "__syncthread": 1, "_arg0_0": 1, "_arg1_0": 1, "_arg1_0_map0_0": 1, "_blkmap": 1, "_boffset": 1, "_cach": 2, "_data": 15, "_end": 1, "_nblock": 1, "_nelem": 1, "_offset": 1, "_openmp": 1, "_start": 1, "abc": 15, "abl": [1, 10], "about": [2, 10], "abov": [1, 2, 3, 7, 8, 10, 15], "absolut": 2, "abstract": [6, 15], "abstractdat": [13, 15], "acceler": 6, "access": [1, 6, 7, 9, 10, 11, 13], "access_mod": 15, "accessor": 15, 
"accomplish": 15, "accord": [0, 6, 10, 13], "account": [10, 15], "accss": 8, "accumul": 3, "accur": 4, "achiev": 4, "across": 1, "act": 15, "action": 1, "activ": 6, "active_threads_count": 1, "actual": [0, 1, 7, 10, 12, 13, 15], "ad": [6, 8], "add": [3, 8, 12], "addit": [3, 7, 12], "addto_vector": 8, "adjust": 6, "advantag": 6, "advers": 2, "affect": 2, "after": [1, 3, 7, 10], "afterward": [1, 8], "again": [0, 2, 6, 8, 10, 11, 12], "against": 15, "aggreg": 12, "aggress": 6, "agre": 7, "aim": 6, "ainv": 8, "algebra": [1, 4, 6, 15], "algorithm": [3, 8, 11], "align": [1, 6, 13], "all": [0, 1, 2, 3, 6, 7, 8, 9, 10, 11, 13, 15], "alloc": [1, 8, 11, 15], "allow": [0, 3, 6, 7, 8, 9, 10, 15], "allow_non": 13, "allreduc": 13, "almost": [1, 6], "along": 6, "alreadi": [1, 6, 8, 15], "also": [1, 5, 6, 9, 10, 12, 15], "alwai": [6, 7, 15], "amg": 8, "among": [1, 6, 7, 8, 10], "amount": [6, 7, 11], "an": [0, 1, 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, 15], "analys": 6, "ani": [0, 1, 2, 3, 6, 7, 10, 12, 13, 15], "annot": [1, 12], "anonym": [3, 7], "anoth": [1, 3, 6, 11, 13, 15], "ap": 6, "api": [0, 2, 4, 6, 13], "appli": [0, 1, 3, 6, 9], "applic": [0, 3, 6, 11, 12], "approach": [1, 6], "appropri": [1, 13, 15], "approxim": 8, "ar": [0, 1, 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, 15], "arang": [13, 15], "architectur": [1, 4, 6, 7], "area": 1, "arg": [13, 15], "arg0": [1, 8], "arg0_0": 1, "arg0_0_0": 8, "arg0_0_map0_0": 8, "arg0_0_map1_0": 8, "arg0_offset": 8, "arg1_0": 1, "arg1_0_map0_0": 1, "arg1_0_vec": 1, "argpars": 13, "argtyp": 13, "argument": [0, 1, 2, 3, 6, 7, 8, 10, 11, 13, 15], "argumentpars": 13, "argv": 12, "aris": 9, "ariti": [1, 2, 3, 7, 8, 13, 15], "aritytypeerror": 13, "around": [2, 8], "arrai": [1, 6, 7, 8, 11, 15], "arrang": 13, "as_cstr": 13, "as_ctyp": 13, "as_numpy_dtyp": 13, "as_tupl": 13, "as_typ": 13, "ask": 6, "assembl": [8, 9, 10, 11, 15], "assembli": [0, 3, 4, 6, 7, 15], "assert": 2, "assign": 6, "associ": [1, 3, 9, 15], "assum": [3, 6, 10, 15], "ast": 6, "ast_bas": 6, 
"ast_bodi": 6, "ast_plan": 6, "asymptot": 8, "attempt": 13, "attent": 7, "attribut": [1, 2, 6, 8, 9, 13], "autom": 6, "automat": [1, 3, 6, 7, 8, 12], "autotun": 6, "autovect": 6, "autovector": 6, "av": 6, "avail": [6, 11, 13], "averag": 12, "avoid": [1, 10, 11, 15], "avx2": 13, "avx512": 13, "awai": 2, "axiom": 2, "b": [3, 8, 15], "back": [1, 7, 15], "backend": [4, 6, 8, 10, 11], "bag": 15, "bandwidth": 10, "barrier": 1, "base": [1, 2, 6, 8, 9, 13, 15], "basenam": 12, "basi": 7, "basic": 6, "bear": 1, "becaus": [6, 10, 15], "becom": 2, "been": [2, 3, 6, 7, 8, 10, 11], "befor": [1, 2, 3, 10, 12], "begin": [7, 15], "behalf": 3, "behav": 9, "being": [2, 3, 7, 13], "belong": 10, "below": [0, 1, 3, 8, 10, 12], "benefit": 6, "best": 6, "better": 6, "between": [3, 6, 7, 10, 15], "bicgstab": 8, "biconjug": 8, "bid": 1, "bin": 13, "binari": 8, "bit": [5, 11], "black": 1, "blkmap": 1, "block": [1, 4, 6, 13, 15], "block_id": 1, "block_offset": 1, "block_spars": 13, "blockdim": [1, 8], "blockid": 1, "blockidx": [1, 8], "blue": 1, "bodi": 6, "boffset": 1, "bookkeep": 15, "bool": 13, "both": [1, 2, 6, 9, 15], "bottom": 9, "bound": [6, 13], "boundari": 10, "bracket": 15, "branch": 10, "breviti": 8, "bridson": 8, "buffer": [8, 15], "buffer_arg0_0": 8, "build": [0, 1, 2, 3, 4, 6, 9, 12, 15], "builder": 13, "built": [1, 2, 3, 6, 8, 13], "builtin": 12, "byte": [1, 13, 15], "c": [1, 6, 7, 13, 15], "c99": 7, "c_for": 6, "c_sym": 6, "cach": [0, 1, 3, 4, 6, 8, 10, 11], "cache_dir": 13, "cacheabl": 2, "cached_properti": 13, "call": [0, 1, 2, 3, 6, 7, 8, 10, 12, 13, 15], "callabl": [0, 1, 2], "can": [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 15], "candid": 2, "cannot": [0, 7, 15], "capabl": 6, "cardin": 3, "care": [0, 1, 3, 4, 15], "carri": 2, "carrier": 1, "case": [1, 3, 6, 7, 8, 9, 10, 12, 15], "cast": 13, "categori": [6, 10], "caus": 15, "cc": 13, "cdim": [13, 15], "cell": [1, 3, 7, 9, 15], "cell2vertex": [1, 7], "certain": [3, 6, 12], "cflag": 13, "cg": 8, "chanc": 6, "chang": [0, 1, 2, 
13], "char": 1, "characterist": 6, "chatti": 13, "check": [10, 13], "check_arg": 13, "check_src_hash": 13, "chip": [3, 7], "choic": [0, 8], "choleski": 8, "choos": [6, 11], "chosen": [0, 6, 8], "chunk": [6, 7, 11], "circumst": 9, "claim": 15, "clash": 1, "class": [0, 4, 13, 15], "clk_local_mem_f": 1, "coalesc": 8, "code": [0, 1, 2, 6, 7, 8, 10, 11, 12, 13], "codegen": 13, "coincid": 3, "col": 8, "colidx": 8, "collect": [2, 8, 11, 15], "collector": 2, "colmap": 8, "colmapdim": 8, "colour": [0, 1, 3, 4, 10], "column": [3, 8, 9, 15], "combin": [6, 9], "come": 6, "comm": [13, 15], "comm_world": 13, "common": [1, 3, 6, 9, 13], "commonli": [7, 8], "commun": [4, 13, 15], "compar": [2, 10], "compat": 15, "compil": [0, 1, 2, 4, 6], "compilationerror": 13, "complement": 8, "complet": [0, 1, 6, 10], "complex": [3, 7, 15], "compon": [7, 9, 15], "compos": 15, "composedmap": [13, 15], "composit": 15, "compress": 8, "compris": [0, 8], "comput": [0, 1, 2, 3, 4, 6, 7, 8, 9, 11, 12, 15], "compute_if_not_empti": 10, "compute_kernel_flop": 13, "concat": 12, "concaten": [8, 15], "concept": [0, 4, 7, 8], "conceptu": 6, "concret": 0, "concurr": [10, 11], "condit": [1, 11], "configur": [4, 8], "configurationerror": 13, "conflict": [0, 1, 15], "conjug": [8, 15], "connect": 3, "consecut": 1, "consequ": 2, "consid": [1, 3, 4, 6, 7, 8, 9], "consider": 7, "consist": [10, 15], "const": [0, 1], "constant": [3, 13, 15], "constitu": [9, 15], "constrain": 15, "constrained_s": [13, 15], "construct": [2, 3, 6, 8, 9, 11, 12, 15], "constructor": [3, 7], "contain": [4, 5, 6, 8, 11, 15], "content": 4, "context": [3, 13, 15], "contigu": [7, 9, 10, 11, 15], "contrast": [1, 3], "contribut": [3, 8, 9, 10, 15], "conveni": 9, "convent": 0, "convers": 1, "convert": 13, "cooper": 1, "coord": [1, 3, 7], "coordin": [1, 3, 7], "copi": [1, 13, 15], "core": [0, 1, 3, 6, 10, 15], "core_part": [10, 13, 15], "core_s": [13, 15], "correct": [9, 15], "correspond": [0, 1, 6, 7, 8, 15], "correspondond": 10, "cost": [1, 2, 
10], "could": [6, 7, 13], "count": [6, 13], "coupl": 9, "cours": [3, 6], "coveni": 7, "cprofil": 12, "cpu": [1, 4], "creat": [1, 3, 6, 7, 13, 15], "creation": [6, 7], "critic": [0, 13], "crucial": 3, "csr": 8, "csrdata": 8, "ctype": [1, 13, 15], "cuda": [0, 7, 8], "current": [0, 1, 2, 6, 7, 8, 11], "cusp": [1, 8], "custom": 3, "cxx": 13, "cxxflag": 13, "cython": 0, "d": [2, 8, 9, 15], "dat": [0, 1, 4, 6, 7, 8, 10, 11, 12, 13], "dat1": 15, "dat_vers": [13, 15], "data": [0, 1, 2, 4, 6, 8, 9, 10, 11, 12, 13, 15], "data_carri": 13, "data_ro": [1, 13, 15], "data_ro_with_halo": [13, 15], "data_with_halo": [13, 15], "data_wo": [13, 15], "data_wo_with_halo": [13, 15], "datacarri": [13, 15], "dataset": [1, 2, 3, 4, 7, 8, 13], "datasettypeerror": 13, "datastructur": 1, "datatyp": [3, 4, 7], "datatypeerror": 13, "datavalueerror": 13, "date": [1, 4, 10, 15], "datn": 15, "dattypeerror": 13, "datview": [13, 15], "deal": 12, "debug": [4, 13], "decl": 6, "declar": [0, 3, 7, 8, 9], "decomposit": 8, "decor": [6, 12, 13], "decoupl": 3, "decref": 13, "decrement": 13, "deep": 15, "def": 12, "default": [0, 8, 12, 13, 15], "defin": [0, 1, 2, 3, 7, 8, 9, 10, 13, 15], "definit": 13, "degre": [3, 10], "deleg": 6, "demand": [6, 15], "demo": 12, "denot": 6, "depend": [2, 5, 6, 10], "describ": [0, 1, 3, 6, 7, 8, 9, 10, 11, 15], "descript": [13, 15], "descriptor": [1, 7, 15], "deserialis": 3, "design": 0, "desir": 15, "destin": 15, "detail": [0, 1, 3, 8, 13, 15], "determin": [1, 8, 13], "develop": 6, "devic": [0, 3, 4, 11], "device_unalloc": 1, "diag": 8, "diagon": 8, "diagram": [1, 9, 10], "dict": 13, "dictionari": [6, 8], "differ": [1, 2, 3, 6, 10, 13, 15], "dim": [3, 7, 13, 15], "dim1": 15, "dimens": [1, 3, 6, 7, 8, 13, 15], "dimension": [1, 3, 6, 8], "dimn": 15, "dimtypeerror": 13, "direct": [1, 3, 7, 15], "directli": [1, 3, 6, 7, 8], "dirti": 15, "disabl": [10, 15], "disk": [0, 1, 2], "distinct": [6, 11, 15], "distinguish": [2, 3, 6, 13, 15], "distribu": 8, "distribut": [0, 1, 4, 7, 8], 
"divid": [1, 8, 10], "divis": 15, "dm": [13, 15], "do": [2, 3, 10, 15], "doc": 13, "docstr": [0, 13], "document": [8, 15], "doe": [0, 1, 2, 3, 7, 8, 12, 15], "dof": [10, 13, 15], "domain": [3, 6], "don": [13, 15], "done": [2, 3, 8, 10, 11, 13], "dot": 12, "doubl": [1, 3, 6, 7, 8, 9, 13], "doubli": 7, "draw": 7, "drive": 6, "driver": 1, "ds2": 2, "dsc": 9, "dset1": 15, "dsetn": 15, "dsl": [3, 6], "dso": 6, "dsr": 9, "dtype": [3, 7, 8, 13, 15], "dtype_limit": 13, "due": 7, "dummi": 15, "dump_stat": 12, "duplic": [13, 15], "durat": 0, "dure": [3, 7, 10, 13], "dvertic": 3, "dynam": [0, 9], "e": [1, 3, 6, 7, 8, 12, 13, 15], "each": [1, 2, 3, 7, 8, 9, 10, 11, 13, 15], "earli": 6, "edg": [3, 11], "edge_weight": 6, "edges2vertic": 3, "effect": 2, "effici": [0, 3, 6, 7, 10, 11], "effort": 6, "efirst": 1, "eg": 13, "egd": 3, "either": [1, 2, 15], "el": 7, "elast": 3, "elem_nod": 8, "element": [1, 3, 6, 7, 8, 9, 11, 15], "els": [1, 8, 15], "embed": 6, "emblemat": 6, "emploi": 15, "emptydatamixin": [13, 15], "enabl": [6, 12, 15], "encapsul": [0, 6], "end": [1, 6, 8, 10, 15], "endif": 1, "enforc": [1, 6], "engin": 6, "enqueu": 1, "ensur": [1, 2, 9], "enter": 13, "entir": [0, 1, 6, 9, 10, 11, 12], "entit": 10, "entiti": [3, 10, 15], "entri": [7, 8, 10, 15], "enumer": 15, "environ": [0, 1, 2, 12], "ep": 3, "equal": [2, 8, 11], "equat": 9, "equival": [3, 9], "error": [2, 13, 15], "establish": 0, "estim": 15, "etc": 7, "evalu": 13, "even": 3, "evenli": 8, "event": 13, "everi": [1, 2, 15], "everyth": 12, "everywher": 3, "evict": 2, "exactli": 6, "exampl": [1, 2, 3, 6, 7, 8, 9, 15], "excel": 12, "except": [0, 4, 8, 11], "exchang": [4, 12, 15], "exclud": [3, 15], "exclus": 8, "exec": 10, "exec_part": 10, "execut": [0, 1, 2, 3, 4, 6, 7, 10, 12, 13, 15], "exisit": 15, "exist": [8, 13, 15], "exit": [2, 12, 13, 15], "expect": [7, 13], "expens": [2, 3, 6, 8], "experi": 6, "explicit": [1, 6, 7], "explicitli": [1, 3, 15], "exploit": [6, 8], "export": 0, "expos": [0, 6], "exposit": 6, 
"express": [6, 15], "extens": 1, "extent": [6, 7, 15], "extern": 1, "extra": 13, "extract": [6, 15], "extrud": 15, "extruded_period": 15, "extrudedset": [13, 15], "f": 12, "fabian": 12, "fact": [3, 7], "factor": [6, 10], "fail": 13, "fals": [6, 12, 13, 15], "far": 6, "fast": 2, "faster": 8, "featur": [6, 10], "fed": 6, "fem": 9, "fewer": 15, "ffc": 6, "fget": 13, "field": 3, "field_is": [13, 15], "file": [12, 15], "filenam": 15, "filesystem": 13, "final": [6, 9], "finalis": [8, 10], "find": [3, 6], "finish": 15, "finit": [6, 7, 9], "firedrak": 5, "first": [1, 2, 3, 6, 7, 8, 9, 10], "fit": [6, 11], "flag": [1, 7, 13], "flat": 6, "flatblock": 6, "flatten": [7, 13, 15], "flattened_map": [13, 15], "flexibl": 3, "float": [3, 7, 8], "float32": 7, "float64": [7, 8], "fluid": 9, "folder": [6, 12], "follow": [0, 1, 2, 3, 6, 7, 8, 9, 10, 11, 12], "forev": 2, "form": [0, 2, 3, 8, 9, 15], "formal": 13, "format": [4, 13, 15], "former": [8, 13], "formul": 9, "found": [6, 7, 12], "four": [10, 15], "fourth": 7, "fragment": 6, "free": [0, 1, 6], "freedom": [3, 10], "freez": 15, "freeze_halo": [13, 15], "from": [0, 1, 2, 3, 6, 8, 9, 10, 11, 13, 15], "from_siz": 8, "frontend": 0, "frozen": 15, "frozen_halo": [13, 15], "full": [1, 15], "fulli": 6, "func": 12, "function": [0, 1, 2, 3, 6, 7, 8, 12, 13, 15], "fundament": 6, "fundecl": 6, "futur": 6, "g": [3, 6, 12, 13, 15], "gain": 6, "garbag": 2, "gather": [1, 7, 9], "gcc": 13, "gener": [0, 1, 2, 3, 6, 8, 9, 11, 12, 13, 15], "get": [6, 12, 15], "get_group_id": 1, "get_local_id": 1, "get_local_s": 1, "get_petsc_dir": 13, "ghost": 15, "give": [3, 5, 6, 8, 11, 15], "given": [0, 1, 3, 6, 7, 8, 10, 12, 13, 15], "glob": [12, 13], "global": [0, 1, 7, 8, 9, 11, 13, 15], "global_kernel": 4, "global_to_local_begin": [13, 15], "global_to_local_end": [13, 15], "globaldataset": [13, 15], "globalset": [13, 15], "gmre": 8, "go": 6, "goal": 6, "good": 10, "gprof": 12, "gprof2dot": 12, "gpu": [1, 4, 6, 10], "gradient": 8, "graph": 3, "greater": [1, 7], 
"grid": 1, "griddim": 1, "group": [1, 13], "grow": [2, 10], "guid": 11, "guidelin": 6, "h2d": 1, "ha": [1, 2, 3, 6, 7, 9, 10, 11, 15], "had": 6, "halo": [0, 1, 4, 12, 13], "halo_exchange_begin": 10, "halo_exchange_end": 10, "halo_valid": [13, 15], "hand": [6, 8, 15], "handl": [1, 3, 15], "har": 8, "hard": 1, "hardwar": 1, "harm": 6, "have": [0, 1, 2, 3, 6, 7, 8, 10, 12, 15], "heavi": 2, "help": 12, "helper": 12, "henc": [1, 2], "here": [4, 6], "hide": 3, "hierarchi": 2, "higher": [3, 6, 7], "highli": 6, "highlight": 10, "hit": 2, "hoist": [6, 7], "hold": [2, 15], "host": [0, 4, 8], "host_unalloc": 1, "how": [3, 4, 7, 11, 13], "howev": [2, 4, 9, 10, 15], "huge": 6, "hybrid": 1, "i": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15], "i0": 8, "i1": 8, "i_1": 1, "id": [1, 7], "ident": [1, 2], "identifi": [1, 3], "idx": [1, 8], "ifdef": 1, "ignor": [6, 12, 13], "illeg": 13, "illustr": [7, 9, 10], "ilu": 8, "imagin": 6, "immedi": 10, "immut": 2, "implcitli": 15, "implement": [0, 1, 2, 3, 6, 7, 9, 10, 15], "impli": 2, "implicitli": [8, 15], "import": [6, 12], "improv": [6, 7], "inaccuraci": 4, "inc": [3, 7, 8, 9, 10, 13, 15], "incid": 3, "includ": [0, 8, 15], "incomplet": 8, "incorpor": 6, "incr": 6, "increas": [6, 7, 11], "incref": 13, "increment": [1, 3, 6, 13, 15], "increment_dat_vers": [13, 15], "incur": 1, "ind_arg1": 1, "ind_arg1_map": 1, "ind_arg1_s": 1, "ind_arg1_shar": 1, "ind_arg1_vec": 1, "ind_map": 1, "ind_off": 1, "ind_siz": 1, "indent": 13, "independ": [2, 3, 10], "index": [1, 3, 4, 7, 8, 13, 15], "indextypeerror": 13, "indexvalueerror": 13, "indic": [1, 3, 8, 11, 13, 15], "indirect": [1, 7, 10, 11, 15], "indirectli": [1, 3, 7, 8, 9, 11], "individu": [1, 6, 15], "infinit": 15, "info": 13, "inform": [0, 3, 11], "infrastructur": 1, "inher": 6, "inherit": 2, "init": [0, 2, 6], "initi": [0, 1, 6, 10, 15], "initialis": [0, 1, 6, 8, 15], "inject": [3, 12], "inlin": 1, "inner": [6, 13, 15], "input": [3, 9], "insert": [8, 10, 15], "insert_mod": 15, "insid": [1, 
15], "insofar": 3, "inspect": 6, "instal": [4, 12], "instanc": [2, 6, 9, 15], "instanti": [0, 13], "instead": [3, 6, 10, 15], "instruct": [2, 13], "int": [1, 6, 7, 8, 9, 13, 15], "int32": [7, 15], "int64": 7, "integ": [8, 11, 15], "intenum": 15, "inter": 6, "interchangibli": 7, "interest": [6, 9, 12], "interfac": [0, 1, 4, 6, 15], "interior": 10, "intermedi": [4, 15], "intern": [2, 4, 13], "internal_comm": 13, "interpret": [0, 3], "intersect": [13, 15], "intra": 6, "intuit": 12, "invalid": [1, 13], "invari": 6, "invers": [8, 11], "invoc": 15, "invok": 7, "involv": [0, 1, 5, 6], "iota": 9, "iota_k": 8, "ir": 4, "is_pyop2_comm": 13, "issu": 6, "item": [1, 2, 7, 13], "iter": [0, 1, 2, 3, 4, 6, 8, 9, 11, 13, 15], "iteratevalueerror": 13, "iterationindex": [7, 8], "iterset": [13, 15], "its": [1, 3, 6, 7, 8, 9, 10, 11, 15], "itself": [2, 3, 13], "itset": 10, "itspac": 6, "itvar1": 6, "itvar2": 6, "j": [6, 7, 8, 9], "jacobi": [8, 12], "jam": 6, "jit": [0, 1, 13], "just": [0, 1, 13, 15], "k": [6, 7, 8], "keep": 10, "kei": [2, 6, 8], "kept": 10, "kern": 12, "kernel": [0, 1, 3, 4, 8, 9, 10, 12, 13, 15], "kerneltypeerror": 13, "kernprof": 12, "keyword": [0, 7, 8], "kind": [3, 6], "know": 6, "known": [6, 7], "krylov": [8, 12], "ksp": 8, "ksp_type": 8, "kwarg": [13, 15], "l": 12, "l1": 6, "l2": [3, 15], "l2norm": 3, "label": 15, "lack": 6, "laid": 7, "languag": [3, 6], "larg": [2, 3, 6, 7], "larger": [6, 7, 10], "last": [8, 11], "latenc": 10, "later": [5, 13], "latter": [8, 15], "launch": [1, 7, 8, 10], "layer": [6, 13, 15], "layers_arrai": [13, 15], "layout": [4, 8, 15], "layout_vec": [13, 15], "lazili": 1, "ld": 13, "ldflag": 13, "leak": 4, "leakag": 13, "least": 3, "leav": [1, 6], "left": [9, 15], "len": 12, "lend": 3, "length": [6, 7, 8, 9, 13, 15], "less": 12, "let": 6, "level": [0, 7, 11, 13], "lgmap": [13, 15], "librari": [0, 1, 8, 13], "licm": 6, "lifetim": 2, "like": [3, 6, 12, 13, 15], "limit": [7, 13], "line": [1, 4], "line_profil": 12, "linear": [1, 3, 4, 6, 9, 15], 
"lineprof": 12, "link": 3, "linker": 13, "list": [0, 1, 6, 7, 9, 13, 15], "littl": 15, "live": [2, 3], "ll": 3, "lmadata": 8, "load": [0, 2, 6, 13, 15], "loc_map": 1, "local": [1, 3, 4, 6, 8, 9, 13, 15], "local_is": [13, 15], "local_kernel": 4, "local_to_global_begin": [13, 15], "local_to_global_end": [13, 15], "local_to_global_numb": [13, 15], "locat": [1, 11], "log": 13, "log_level": 13, "logger": 4, "logic": 15, "long": [1, 2, 7], "look": [6, 15], "lookup": 8, "loop": [0, 1, 2, 4, 6, 7, 8, 11, 12, 15], "loopycompat": 13, "lost": 2, "low": 0, "lower": 7, "lprof": 12, "lu": 8, "m": [9, 12, 15], "mac": 5, "machin": 6, "made": 11, "mai": [0, 2, 3, 6, 7, 10], "main": [5, 6, 10], "mainli": 6, "maintain": 10, "major": 10, "make": [0, 1, 2, 3, 6, 7, 12, 15], "manag": [13, 15], "mani": [2, 3, 4, 6, 7, 9], "manipul": 7, "manner": 8, "manual": [6, 8], "manycor": 7, "map": [0, 1, 2, 4, 6, 7, 8, 11, 13], "map_": 15, "maps_": 15, "maptypeerror": 13, "mapvalueerror": 13, "mark": [10, 15], "marshal": 1, "mask": 11, "mass": 7, "mass_cell_integral_0_otherwis": 8, "mat": [0, 4, 8, 10, 13], "mat_kernel": 8, "matassemblybegin": 8, "matassemblyend": 8, "match": [7, 8, 10], "matnest": [9, 13], "matric": [8, 10, 11, 13, 15], "matrix": [0, 1, 3, 4, 9, 10, 11, 13, 15], "matsetvalu": 8, "mattypeerror": 13, "max": [3, 13, 15], "maxim": 6, "maximis": 10, "maximum": 3, "maybe_set_dat_dirti": 10, "maybe_set_halo_update_need": 10, "mc": 9, "md": 9, "mdat": 15, "mdset": 15, "mdset_or_dat": 15, "mean": [3, 6, 12, 15], "mechan": 8, "member": [6, 15], "memori": [1, 2, 3, 4, 7, 8, 11, 13, 15], "memory_profil": 12, "memprof": 12, "mesh": [0, 3, 6, 10, 15], "messag": 13, "metaclass": 0, "metadata": [2, 3, 15], "method": [6, 8, 9, 15], "midpoint": [1, 3, 7], "might": [6, 15], "min": [3, 13, 15], "mind": 1, "mini": 11, "minim": [6, 8], "minimis": 10, "minimum": 3, "minor": 8, "mitig": 6, "mix": [4, 13], "mixeddat": [9, 13, 15], "mixeddataset": [9, 13, 15], "mixedmap": [9, 13, 15], "mixedmat": 15, 
"mixedset": [9, 13, 15], "mixin": 15, "mixtur": 15, "mm": 9, "mmap": 9, "mode": [3, 8, 10, 13, 15], "model": 6, "modevalueerror": 13, "modif": [1, 7, 8], "modifi": [1, 6, 7, 10, 15], "modul": [0, 2, 4], "more": [3, 6, 7, 8, 9, 12, 13, 15], "most": [0, 1, 3, 6, 9, 11, 12], "motion": 6, "motiv": 2, "movement": 6, "mpi": [0, 1, 4, 8, 11, 15], "mr": 9, "mset": 15, "msg": 13, "much": [2, 3, 6, 12], "multi": 1, "multigrid": 8, "multipl": [1, 3, 4, 6, 8, 13, 15], "multipli": 15, "must": [1, 2, 3, 6, 7, 8, 9, 10, 15], "my": 12, "my_func": 12, "my_kernel": 6, "n": [1, 8, 9, 12, 15], "name": [1, 3, 6, 7, 8, 12, 13, 15], "namespac": 12, "nametypeerror": 13, "natur": 3, "navier": 9, "nblock": 1, "nbyte": [1, 13, 15], "ncol": 8, "necessari": [0, 1], "necessarili": [3, 11], "need": [1, 3, 7, 8, 10, 15], "needs_exec_halo": 10, "neighbor": 10, "nele": 7, "nelem": [1, 8], "nentries_per_el": 8, "nest": [7, 13, 15], "never": 1, "nevertheless": 4, "new": [4, 11, 13], "newli": [1, 11], "next": [11, 15], "no_fork_avail": 13, "node": [3, 6, 8, 9, 13], "node_local_compil": 13, "non": [3, 6, 8, 10, 15], "none": [6, 10, 13, 15], "norm": [3, 13, 15], "normal": [2, 15], "notat": 15, "note": [1, 3, 7, 9, 12, 15], "noth": [12, 15], "now": 6, "np": 8, "nrow": 8, "nthrcol": 1, "nthread": 1, "num_cel": 7, "num_el": 8, "num_nod": 8, "num_vertic": 7, "number": [1, 2, 3, 4, 6, 7, 8, 9, 11, 12, 13, 15], "numer": [3, 6, 13], "numpi": [3, 7, 13, 15], "nvida": 1, "o": [5, 12, 15], "obj": 13, "object": [0, 1, 3, 4, 6, 13, 15], "objectcach": [2, 15], "objet": 8, "obtain": [5, 6, 7, 15], "occur": [1, 2, 9, 15], "odiag": 8, "off": [6, 8, 10, 13], "offload": 1, "offset": [1, 3, 8, 11, 13, 15], "offset_b": 1, "offset_b_ab": 1, "offset_quoti": [13, 15], "often": [3, 9], "omit": [0, 3, 7, 8, 12, 15], "omp": 1, "omp_get_max_thread": 1, "omp_get_thread_num": 1, "omp_num_thread": 1, "onc": [0, 1, 2, 10, 13], "one": [2, 3, 6, 7, 8, 9, 10, 13, 15], "onli": [1, 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, 15], "onto": [2, 15], 
"op": 6, "op2": [0, 1, 2, 3, 4, 6, 7, 8, 9, 15], "opaquetyp": 13, "opencl": [0, 7], "openmp": 8, "oper": [1, 2, 3, 6, 7, 8, 15], "opportun": 6, "opt": [6, 13], "optim": 4, "optimis": 13, "option": [6, 7, 8, 13, 15], "order": [3, 7, 10, 15], "organis": 1, "orient": 0, "origin": 15, "orthogon": 0, "other": [1, 3, 6, 8, 9, 10, 12, 15], "otherwis": 13, "our": 6, "out": [1, 2, 3, 4, 6, 7, 8, 10, 13], "outcom": 6, "outer": 6, "outermost": 1, "outerproduct": 6, "output": [3, 12], "outsid": 0, "over": [1, 2, 3, 7, 8, 9, 10, 11, 12, 15], "overhead": [1, 15], "overlap": [1, 4], "overridden": 0, "overview": [0, 1, 6], "overwritten": 15, "own": [0, 1, 8, 10, 12, 15], "owned_indic": [13, 15], "owned_part": [10, 13, 15], "p": [1, 7], "p_blk_map": 1, "p_ind_map": 1, "p_ind_offset": 1, "p_ind_siz": 1, "p_loc_map": 1, "p_nelem": 1, "p_nthrcol": 1, "p_offset": 1, "p_thrcol": 1, "pack": 9, "packag": 4, "pad": 6, "page": 4, "pai": [2, 6], "paid": 1, "pair": [1, 3, 8, 9, 11], "par_loop": [0, 1, 2, 3, 7, 8, 9, 10, 11, 15], "parallel": [0, 1, 2, 4, 6, 7, 8, 10, 12, 15], "paramet": [1, 3, 6, 8, 13, 15], "parent": [13, 15], "parloop": [4, 6, 12], "pars": 13, "parse_arg": 13, "parser": [6, 13], "part": [1, 2, 8, 9, 12, 15], "particular": [1, 2, 3, 6, 7, 8], "particularli": 6, "partit": [0, 1, 3, 4, 10, 15], "partition_s": [13, 15], "pase": 8, "pass": [0, 1, 2, 3, 6, 7, 8, 9, 11, 13, 15], "passthrough": 12, "past": 15, "path": [13, 15], "pattern": [3, 4, 9, 12, 13], "payload": 15, "pc": 8, "pc_type": 8, "pdf": 12, "pedregosa": 12, "peopl": 4, "pep": 13, "per": [1, 7, 8, 10, 11, 12, 15], "perform": [0, 2, 4, 7, 8, 10, 11, 15], "permiss": 15, "permut": 15, "permutedmap": [13, 15], "persist": 2, "perspect": 0, "petsc": [1, 8, 9, 10, 12, 13, 15], "petsc4pi": 8, "petsc_count": 15, "phase": 7, "pip": 12, "plai": 6, "plain": [1, 7], "plan": [0, 1, 4, 12], "platform": [5, 6, 7], "pleas": 4, "plot": 12, "po": 8, "point": [1, 3, 7, 15], "pointer": [1, 2, 7, 8], "pointwis": 15, "popular": 8, "portabl": 
4, "posit": [7, 8], "possibl": [1, 3, 6, 15], "possibli": 6, "postpon": 15, "potenti": [1, 2], "practic": [0, 10], "pragma": [1, 6], "precondition": [8, 13], "prefer": 15, "prematur": 6, "present": [1, 13], "pressur": [3, 6, 9], "presum": 15, "prevent": [1, 13], "previou": [0, 1, 6], "previous": 3, "price": 10, "primarili": [3, 6], "primit": 3, "print": [2, 12, 13], "print_cache_s": [2, 13], "prior": [1, 3], "privat": 1, "probabl": 13, "problem": 2, "proce": [8, 11], "procedur": [0, 1], "process": [0, 1, 2, 6, 8, 10, 11, 13, 15], "processor": [0, 8, 10], "prod": 6, "produc": [0, 9, 12, 13], "product": [6, 7, 15], "profil": 4, "program": [2, 4, 6, 13], "programat": 12, "progress": 13, "properli": 6, "properti": [1, 13, 15], "proport": [6, 7], "prose": 4, "prove": 6, "provid": [0, 1, 3, 6, 8, 9, 12, 15], "proxi": 15, "pstat": 12, "public": 0, "pure": 3, "purpos": 6, "py": [6, 12], "pyarrayobject": 1, "pycuda": 1, "pyint_aslong": 1, "pyobject": 1, "pyop2": [9, 10, 11], "pyop2_backend": 0, "pyop2_block_spars": 13, "pyop2_cache_dir": 13, "pyop2_cc": 13, "pyop2_cflag": 13, "pyop2_check_src_hash": 13, "pyop2_comm": 13, "pyop2_compute_kernel_flop": 13, "pyop2_cxx": 13, "pyop2_cxxflag": 13, "pyop2_debug": 13, "pyop2_ld": 13, "pyop2_ldflag": 13, "pyop2_log_level": 13, "pyop2_matnest": 13, "pyop2_no_fork_avail": 13, "pyop2_node_local_compil": 13, "pyop2_print_cache_s": [2, 13], "pyop2_print_summari": 12, "pyop2_simd_width": 13, "pyop2_type_check": 13, "pyopencl": 1, "pypi": 12, "python": [0, 1, 2, 5, 7, 12, 13, 15], "qualifi": 1, "quantiti": [3, 9], "queri": [12, 13], "quit": 4, "quotient": 15, "race": [1, 11], "rais": [8, 13], "rang": [0, 6, 8, 13], "rank": [1, 15], "rather": [6, 7, 13, 15], "re": [6, 15], "read": [1, 3, 4, 7, 9, 10, 11, 13, 15], "real": 6, "recal": 1, "receiv": [7, 15], "recognis": 13, "recommend": 5, "recompil": 0, "recomput": [3, 8], "reconfigur": 13, "record": [11, 12], "rectangular": 6, "red": 1, "reduc": [3, 15], "reduct": [0, 10, 11, 12, 15], 
"reduction_begin": 10, "reduction_end": 10, "redund": 10, "redundantli": [10, 15], "refer": [2, 3, 11, 13], "referenc": 11, "regardless": 0, "region": [0, 1, 10, 12, 13], "regist": [6, 7], "regular": 3, "regularli": [4, 12], "rel": 10, "relat": [10, 15], "relationship": [7, 9], "releas": 6, "reli": 2, "remain": [1, 2], "remov": [12, 15], "render": 15, "renumb": 4, "rep2loopi": 13, "repeatedli": 15, "replac": 13, "repres": [3, 6, 8, 12, 15], "represent": [4, 13, 15], "reqd_work_group_s": 1, "request": [0, 1, 6, 8, 10, 15], "requir": [1, 3, 5, 7, 8, 10, 11, 15], "reset": [11, 13], "reshap": 13, "residu": 8, "resort": 6, "resourc": [4, 7], "respect": [1, 3, 7, 9], "respons": 15, "restrict": [3, 7], "result": [2, 3, 7, 8, 9, 13, 15], "return": [1, 3, 7, 8, 13, 15], "reus": 3, "revers": 15, "rhs_kernel": 8, "right": [8, 9, 15], "robert": 12, "round_up": 1, "routin": 1, "row": [3, 8, 9, 10, 11, 15], "row_ptr": 8, "rowmap": 8, "rowmapdim": 8, "rowptr": 8, "rule": [2, 9], "run": [0, 1, 2, 6, 7, 15], "runtim": [0, 1, 2, 12], "runtimeerror": 13, "rw": [3, 7, 10, 13, 15], "s1": [6, 9], "s2": [6, 9], "safe": [1, 7, 10], "same": [0, 1, 2, 3, 6, 8, 9, 11, 13, 15], "save": [0, 8, 12, 13, 15], "sc0": 9, "sc1": 9, "scalar": [3, 15], "scalar_lgmap": [13, 15], "scan": 8, "scatter": [7, 9, 15], "schedul": [0, 1, 3], "schemat": 0, "scientif": 3, "scope": 1, "script": [5, 12], "search": [4, 8], "second": [1, 2, 6, 7, 9, 13], "section": [3, 6, 10], "see": [1, 7, 9, 12, 13, 15], "select": [0, 4, 15], "self": [13, 15], "semant": [6, 15], "send": 15, "sent": 15, "separ": [0, 1, 2, 8, 9, 10], "sequenti": [0, 6, 8, 11], "seri": 6, "serial": 15, "serialis": 3, "serv": 7, "session": 0, "set": [0, 1, 2, 4, 6, 7, 8, 10, 11, 12, 13], "set1": 15, "set_log_level": 13, "set_offset": 1, "set_siz": 1, "setfreedatacarri": [13, 15], "setn": 15, "setpartit": [13, 15], "settypeerror": 13, "setup": 2, "setvalueerror": 13, "sever": [7, 13], "shape": [3, 7, 13, 15], "share": [0, 1, 8, 9, 11, 13], "short": 1, 
"shortcut": 6, "should": [2, 5, 6, 11, 13, 15], "show": [1, 6, 15], "shown": 9, "side": [8, 15], "signatur": [1, 6, 7], "signific": [6, 15], "significantli": 4, "silent": 13, "simd": [6, 13], "simd_width": 13, "similar": [1, 3, 8, 10], "similarli": 7, "simpl": [6, 12], "simplest": 6, "simpli": 6, "simul": 2, "simultan": 1, "sinc": [1, 2, 3, 6, 7, 8, 10, 11, 15], "singl": [0, 1, 3, 6, 7, 8, 11, 15], "size": [1, 2, 3, 6, 7, 8, 10, 11, 13, 15], "sizeof": 1, "sizetypeerror": 13, "skip": [7, 15], "slice": 6, "small": [6, 10, 15], "smaller": 6, "smp": 1, "snippet": 2, "so": [1, 2, 3, 6, 11, 15], "soa": 8, "solut": [8, 9], "solv": [3, 4, 9], "solver": [1, 8, 10, 12, 15], "some": [2, 3, 5, 6, 9, 15], "some_map": 15, "somewhat": 5, "soon": [2, 10], "sort": [6, 8, 11], "sourc": [3, 9, 12], "space": [3, 4, 6, 8, 9, 15], "span": 3, "spars": [1, 3, 4], "sparsiti": [2, 3, 4, 12, 15], "sparsityformaterror": 13, "sparsitytypeerror": 13, "specif": [0, 1, 3, 4], "specifi": [3, 6, 15], "speed": 6, "spent": 6, "split": [1, 6, 11, 13, 15], "squar": [8, 9], "sr0": 9, "sr1": 9, "stabil": 8, "stack": 3, "stage": [0, 1, 3, 4, 7, 8, 13, 15], "stand": 6, "standard": [13, 15], "start": [1, 8, 10, 15], "stat": 12, "state": 1, "statement": 6, "static": 1, "statist": 15, "stdout": 12, "step": [0, 6], "still": [6, 10], "stoke": 9, "stop": 15, "storag": [1, 4], "store": [3, 6, 7, 8, 10, 15], "str": 13, "straight": 1, "strategi": [2, 6], "string": [1, 6, 7, 13, 15], "strip": 13, "strive": 6, "structur": [1, 3, 6, 8, 9, 10, 13, 15], "stub": 1, "subblock": 9, "submodul": 4, "subpackag": 4, "subsequ": [2, 8, 10, 15], "subset": [7, 13, 15], "subsetindexoutofbound": 13, "success": [3, 6], "successfulli": 5, "sudo": 12, "suffici": 11, "suggest": [6, 8], "suitabl": 1, "sum": 15, "summari": 12, "summat": 3, "superset": [13, 15], "support": [1, 3, 4, 6, 7, 8, 10, 15], "surject": 3, "switch": 0, "sy": 12, "symbol": 6, "symmetric_differ": [13, 15], "synchron": 6, "synchronis": 1, "syntax": [6, 7], "system": 
[4, 6, 9], "t": [13, 15], "tabl": 1, "take": [0, 1, 2, 3, 6, 8, 15], "taken": 15, "target": [1, 3, 7], "techniqu": 6, "tell": [3, 6, 12], "temp_internal_comm": 13, "templat": 1, "temporari": 2, "temporarili": 15, "tensor": [3, 6, 7, 8, 9], "term": 6, "termin": 15, "test": [5, 6], "test_iteration_space_dat": 6, "test_matric": 6, "th": 15, "than": [3, 6, 8, 9, 12, 13, 15], "thei": [1, 2, 3, 9, 10, 15], "them": [1, 3, 15], "themselv": 9, "therefor": [1, 3, 5, 6, 8, 10, 12], "thi": [0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15], "thin": 8, "third": 7, "those": [3, 6, 11, 12], "though": 3, "thought": 6, "thrcol": 1, "thread": [1, 6, 7, 8, 10, 11], "threadidx": [1, 8], "three": [1, 2, 3, 7, 8, 15], "through": [3, 6, 7, 10, 15], "throw": 2, "thu": [2, 4], "ti": 2, "tid": 1, "tile": 6, "time": [0, 1, 6, 7, 11, 12, 13, 15], "timed_funct": [12, 13], "timed_region": [12, 13], "timed_stag": 13, "timer": 4, "tmp": 13, "togeth": 9, "told": 7, "too": [2, 13, 15], "toolchain": 0, "top": [1, 2, 6, 9, 15], "topologi": [0, 3], "toset": [13, 15], "total": [8, 12, 15], "total_s": [13, 15], "touch": [0, 10, 11, 15], "tpdf": 12, "track": 10, "tranfer": 1, "transfer": [0, 1, 3, 8], "transform": 6, "transient": 2, "transit": 1, "translat": [1, 3, 7], "transpar": [0, 9], "treat": [3, 9], "tree": [6, 13], "triangl": 7, "trigger": [1, 6, 10], "trim": 13, "trip": 6, "true": [6, 7, 13], "try": [6, 13, 15], "tupl": [7, 13, 15], "tuplifi": 13, "turn": 13, "two": [1, 2, 3, 6, 8, 10, 11, 15], "typ": 13, "type": [1, 2, 3, 4, 7, 8, 13], "type_check": 13, "typeerror": 13, "typic": 6, "u": 6, "uaj": 6, "ubuntu": 5, "uid1001": 13, "unabl": 13, "unblocked_lgmap": [13, 15], "undefin": 7, "under": [9, 12], "underli": [3, 6, 15], "unfreez": 15, "unfreeze_halo": [13, 15], "uniform": 3, "uniformli": 9, "union": [13, 15], "uniqu": [2, 3, 8, 11], "unit": 1, "unless": 1, "unlik": 15, "unrel": 11, "unrol": 6, "unsaf": 13, "unsafe_reconfigur": 13, "unstructur": [0, 3, 6], "unsupport": 8, "until": [11, 15], 
"unwrap": 1, "up": [1, 2, 3, 6, 8, 10, 11, 12, 15], "upcast": 15, "updat": [4, 13, 15], "us": [0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 13, 15], "usag": [7, 12, 15], "user": [0, 1, 2, 3, 4, 5, 6, 8, 10, 13, 15], "usual": [6, 8, 13], "util": 4, "v": 9, "v_op_uaj": 6, "valid": [1, 3, 8, 9, 13, 15], "validate_bas": 13, "validate_dtyp": 13, "validate_in": 13, "validate_rang": 13, "validate_typ": 13, "valu": [1, 3, 7, 8, 13, 15], "valueerror": 13, "values_with_halo": [13, 15], "variabl": [0, 1, 2, 3, 6, 12, 13], "variou": [6, 8], "vast": 10, "vec": [13, 15], "vec_context": [13, 15], "vec_ro": [13, 15], "vec_wo": [13, 15], "vecaccessmixin": [13, 15], "vect": 6, "vector": [3, 6, 7, 8, 9, 10, 11, 15], "veloc": 9, "venv": 5, "veri": [1, 3, 6, 7, 10, 12], "verifi": 13, "verify_reshap": 13, "versa": [8, 10], "version": [0, 1, 4, 5, 8, 12], "vertex": [3, 7, 11], "vertic": [3, 7, 11, 15], "via": [1, 3, 7, 8, 9, 15], "vice": [8, 10], "view": [3, 15], "visibl": [1, 3], "visit": 6, "void": [1, 3, 6, 7, 8, 9], "volum": 10, "wa": 1, "wai": [1, 3, 5, 6, 7, 9, 10, 11, 12], "wait": 10, "want": [3, 6, 15], "warn": 13, "we": [1, 2, 3, 6, 7, 10, 15], "well": [6, 13], "when": [0, 1, 3, 6, 8, 9, 10, 12, 13, 15], "whenev": 8, "where": [1, 2, 3, 7, 8, 11, 13, 15], "wherea": 1, "whether": 10, "which": [0, 1, 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, 15], "while": [3, 11], "whilst": 15, "whose": [0, 1, 15], "within": [1, 6, 11, 15], "without": [0, 3, 6, 10, 12, 15], "word": [3, 8], "work": [1, 5, 6, 7, 10, 11, 12], "worri": 2, "would": [3, 6, 7], "wrap": [1, 8, 13], "wrap_mat_kernel__": 8, "wrap_midpoint__": 1, "wrapper": [1, 8, 10], "write": [1, 3, 6, 7, 8, 10, 12, 13, 15], "written": [1, 3, 6, 7], "x": [1, 5, 6, 8, 13], "y": [1, 6], "ye": 13, "yield": [3, 8, 9], "you": [3, 5, 6, 12, 13, 15], "your": [12, 15], "zero": [3, 6, 8, 13, 15]}, "titles": ["PyOP2 Architecture", "PyOP2 Backends", "Caching in PyOP2", "PyOP2 Concepts", "Welcome to PyOP2\u2019s documentation!", "Installing PyOP2", "The PyOP2 
Intermediate Representation", "PyOP2 Kernels", "PyOP2 Linear Algebra Interface", "Mixed Types", "MPI", "Parallel Execution Plan", "Profiling", "pyop2 package", "pyop2.codegen package", "pyop2.types package", "pyop2 user documentation"], "titleterms": {"": [4, 12], "The": 6, "access": [3, 15], "achiev": 6, "algebra": 8, "api": 7, "architectur": 0, "assembl": 3, "assembli": [8, 9, 10], "backend": [0, 1], "block": 9, "build": 8, "builder": 14, "cach": [2, 13], "class": 2, "codegen": 14, "colour": 11, "commun": 10, "compil": 13, "comput": 10, "concept": 3, "configur": 13, "consolid": 12, "const": 3, "content": [5, 13, 14, 15], "cpu": 6, "creat": 12, "cuda": 1, "dat": [3, 9, 15], "data": [3, 7], "data_carri": 15, "dataset": [9, 15], "datatyp": 13, "debug": 2, "descriptor": 3, "devic": 1, "differ": 12, "distribut": 10, "document": [4, 16], "except": 13, "exchang": 10, "execut": 11, "format": 8, "from": 12, "glob": 15, "global": 3, "global_kernel": 13, "gpu": 8, "graph": 12, "halo": [10, 15], "host": 1, "how": 6, "indic": 4, "instal": 5, "interfac": 8, "intermedi": 6, "intern": 12, "invoc": 3, "ir": 6, "iter": 7, "kernel": [6, 7], "layout": 7, "leak": 2, "line": 12, "linear": 8, "local": [7, 10, 11], "local_kernel": 13, "logger": 13, "loop": 3, "loopycompat": 14, "map": [3, 9, 15], "mat": [3, 9, 15], "matric": 3, "matrix": 8, "memori": 12, "mix": 9, "modul": [13, 14, 15], "mpi": [10, 13], "multipl": 0, "node": 14, "number": 10, "object": 2, "op2": 13, "opencl": 1, "openmp": 1, "optim": 6, "optimis": 14, "overlap": 10, "packag": [13, 14, 15, 16], "parallel": [3, 11], "parloop": 13, "partit": 11, "pattern": 8, "perform": 6, "plan": 11, "portabl": 6, "profil": [12, 13], "program": 12, "pyop2": [0, 1, 2, 3, 4, 5, 6, 7, 8, 12, 13, 14, 15, 16], "reduct": 3, "renumb": 11, "rep2loopi": 14, "represent": [6, 14], "run": 12, "select": 6, "sequenti": 1, "set": [3, 9, 15], "solv": 8, "space": 7, "spars": 8, "sparsiti": [8, 9, 13], "specif": 6, "stage": 11, "storag": 8, "submodul": 
[13, 14, 15], "subpackag": 13, "support": 0, "system": 8, "tabl": 4, "timer": 12, "type": [9, 15], "us": [6, 12], "user": 16, "util": 13, "version": 13, "welcom": 4}}) \ No newline at end of file diff --git a/user.html b/user.html new file mode 100644 index 000000000..d236563a7 --- /dev/null +++ b/user.html @@ -0,0 +1,125 @@ + + + + + + + + pyop2 user documentation — PyOP2 2020.0 documentation + + + + + + + + + + + + + + +
+
+
+
+ +
+

pyop2 user documentation

+
+

pyop2 Package

+
+
+ + +
+
+
+
+ +
+
+ + + + \ No newline at end of file