Iteration over documentation and API

pc2 · Sep 1, 2021 · af56949 · af56949
1 parent 76223d3
commit af56949
Show file tree

Hide file tree

Showing 9 changed files with 228 additions and 125 deletions.
diff --git a/README.md b/README.md
@@ -138,7 +138,7 @@ int main(int argc, char **argv) {
     executor.set_input(grid_buffer);
 ```
 
-After checking and parsing the arguments, we read the input data. Then, we pick and initialize an executor. Executors are the user-facing facades of StencilStream and the library offers different executors that are optimized for different scenarios. In this case, we pick the `StencilExecutor`, which is the general-purpose, jack-of-all-trades executor. The static operation and performance parameters are defined as template parameters to an executor. In it's simplest form, it only requires our cell type, the radius of the stencil and the type of the transition function. 
+After checking and parsing the arguments, we read the input data. Then, we pick and initialize an executor. Executors are the user-facing facades of StencilStream and the library offers different executors that are optimized for different scenarios. In this case, we pick the `StencilExecutor`, which is the most universal executor. The static operation and performance parameters are defined as template parameters to an executor. In it's simplest form, it only requires our cell type, the radius of the stencil and the type of the transition function. 
 
 ``` C++
 #ifdef HARDWARE

diff --git a/StencilStream/AbstractExecutor.hpp b/StencilStream/AbstractExecutor.hpp
@@ -22,31 +22,119 @@
 #include <CL/sycl.hpp>
 
 namespace stencil {
+/**
+ * \brief Base class for all execution managers.
+ *
+ * Executors are the user-facing facades of StencilStream that orchestrate the computations.
+ * Different executors may use different architectures and strategies to apply transition functions
+ * to cells. Application code that may work with any executor can use this base class to access
+ * them. It has multiple logical attributes that can be configured:
+ *
+ * ### Grid
+ *
+ * The grid is the logical array of cells, set with \ref AbstractExecutor.set_input. A stencil
+ * executor does not work in place and a buffer used to initialize the grid can be used for other
+ * tasks afterwards. The \ref AbstractExecutor.run method alters the state of the grid and the grid
+ * can be copied back to a given buffer using \ref AbstractExecutor.copy_output.
+ *
+ * ### Transition Function
+ *
+ * A stencil executor stores an instance of the transition function since it may require some
+ * configuration and runtime-dynamic parameters too. An instance is required for the initialization,
+ * but it may be replaced at any time with \ref AbstractExecutor.set_trans_func.
+ *
+ * ### Generation Index
+ *
+ * This is the generation index of the current state of the grid. \ref AbstractExecutor.run updates
+ * and therefore, it can be ignored in most instances. However, it can be reset if a transition
+ * function needs it.
+ *
+ * \tparam T The cell type.
+ * \tparam stencil_radius The radius of the stencil buffer supplied to the transition function.
+ * \tparam TransFunc The type of the transition function.
+ */
 template <typename T, uindex_t stencil_radius, typename TransFunc> class AbstractExecutor {
   public:
+    /**
+     * \brief Create a new abstract executor.
+     * \param halo_value The value of cells that are outside the grid.
+     * \param trans_func The instance of the transition function that should be used to calculate
+     * new generations.
+     */
     AbstractExecutor(T halo_value, TransFunc trans_func)
         : halo_value(halo_value), trans_func(trans_func), i_generation(0) {}
 
+    /**
+     * \brief Compute the next generations of the grid and store it internally.
+     *
+     * This will use the transition function to compute the next `n_generations` generations of the
+     * grid and store the new state of the grid internally. The resulting grid state can be
+     * retrieved with \ref AbstractExecutor.copy_output.
+     *
+     * \param n_generations The number of generations to calculate.
+     */
     virtual void run(uindex_t n_generations) = 0;
 
+    /**
+     * \brief Set the internal state of the grid.
+     *
+     * This will copy the contents of the buffer to an internal representation. The buffer may be
+     * used for other purposes later. It must not reset the generation index. The range of the input
+     * buffer will be used as the new grid range.
+     *
+     * \param input_buffer The source buffer of the new grid state.
+     */
     virtual void set_input(cl::sycl::buffer<T, 2> input_buffer) = 0;
 
+    /**
+     * \brief Copy the state of the grid to a buffer.
+     *
+     * This will copy the cells of the internal grid representation to the buffer. The range of the
+     * output buffer must be equal to the grid range (retrievable with \ref
+     * AbstractExecutor.get_grid_range).
+     *
+     * \param output_buffer The target buffer.
+     */
     virtual void copy_output(cl::sycl::buffer<T, 2> output_buffer) = 0;
 
+    /**
+     * \brief Get the range of the internal grid.
+     */
     virtual UID get_grid_range() const = 0;
 
+    /**
+     * \brief Get the value of cells outside of the grid.
+     */
     T const get_halo_value() const { return halo_value; }
 
+    /**
+     * \brief Set the value of cells outside of the grid.
+     */
     void set_halo_value(T halo_value) { this->halo_value = halo_value; }
 
+    /**
+     * \brief Get the configured transition function instance.
+     */
     TransFunc get_trans_func() const { return trans_func; }
 
+    /**
+     * \brief Set the transition function instance.
+     */
     void set_trans_func(TransFunc trans_func) { this->trans_func = trans_func; }
 
+    /**
+     * \brief Get the generation index of the grid.
+     */
     uindex_t get_i_generation() const { return i_generation; }
 
+    /**
+     * \brief Set the generation index of the grid.
+     */
     void set_i_generation(uindex_t i_generation) { this->i_generation = i_generation; }
 
+    /**
+     * \brief Increase the generation index of the grid by a certain delta.
+     */
     void inc_i_generation(index_t delta) { this->i_generation += delta; }
 
   private:

diff --git a/StencilStream/Grid.hpp b/StencilStream/Grid.hpp
@@ -30,11 +30,12 @@
 namespace stencil {
 
 /**
- * \brief A rectangular container of cells with a dynamic, arbitrary size.
+ * \brief A rectangular container of cells with a dynamic, arbitrary size, used by the \ref
+ * StencilExecutor.
  *
- * It logically contains the grid the transition function is applied to. As described in \ref
- * tiling, it partitions the grid into tiles of static size, which are the units the \ref
- * TilingExecutionKernel works on.
+ * This class is part of the \ref tiling architecture. It logically contains the grid the transition
+ * function is applied to and it partitions the grid into tiles of static size. These are the units
+ * the \ref TilingExecutionKernel works on.
  *
  * Apart from providing copy operations to and from monolithic grid buffers, it also handles the
  * input and output kernel submission for a given tile.
@@ -48,9 +49,10 @@ namespace stencil {
 template <typename T, uindex_t tile_width, uindex_t tile_height, uindex_t halo_radius,
           uindex_t burst_length>
 class Grid {
-  public:
+  private:
     using Tile = Tile<T, tile_width, tile_height, halo_radius, burst_length>;
 
+  public:
     /**
      * \brief Create a grid with undefined contents.
      *

diff --git a/StencilStream/MonotileExecutionKernel.hpp b/StencilStream/MonotileExecutionKernel.hpp
@@ -19,19 +19,23 @@
  */
 #pragma once
 #include "GenericID.hpp"
+#include "Helpers.hpp"
 #include "Index.hpp"
 #include "Stencil.hpp"
-#include "Helpers.hpp"
 #include <optional>
 
 namespace stencil {
 
 /**
- * \brief A kernel that executes a stencil transition function on a tile.
+ * \brief A kernel that executes a stencil transition function using the monotile approach.
  *
  * It receives the contents of a tile and it's halo from the `in_pipe`, applies the transition
  * function when applicable and writes the result to the `out_pipe`.
  *
+ * With the monotile approach, the whole grid fits in one tile. This eliminates the need to
+ * calculate the cells of the tile halo, reducing the cache size and number of loop iterations. More
+ * is described in \ref monotile.
+ *
  * \tparam TransFunc The type of transition function to use.
  * \tparam T Cell value type.
  * \tparam stencil_radius The static, maximal Chebyshev distance of cells in a stencil to the
@@ -53,14 +57,25 @@ class MonotileExecutionKernel {
      */
     const static uindex_t stencil_diameter = Stencil<T, stencil_radius>::diameter;
 
+    /**
+     * \brief The number of cells in the tile.
+     */
     const static uindex_t n_cells = tile_width * tile_height;
 
+    /**
+     * \brief The number of cells that need to be fed into a stage before it produces correct
+     * values.
+     */
     const static uindex_t stage_latency = stencil_radius * (tile_height + 1);
 
+    /**
+     * \brief The number of cells that need to be fed into the pipeline before it produces correct
+     * values.
+     */
     const static uindex_t pipeline_latency = pipeline_length * stage_latency;
 
     /**
-     * \brief The total number of cells to read from the `in_pipe`.
+     * \brief The total number of loop iterations.
      */
     const static uindex_t n_iterations = pipeline_latency + n_cells;
 
@@ -70,14 +85,19 @@ class MonotileExecutionKernel {
      * \param trans_func The instance of the transition function to use.
      * \param i_generation The generation index of the input cells.
      * \param n_generations The number of generations to compute. If this number is bigger than
-     * `pipeline_length`, only `pipeline_length` generations will be computed. \param grid_width The
-     * number of cell columns in the grid. \param grid_height The number of cell rows in the grid.
+     * `pipeline_length`, only `pipeline_length` generations will be computed.
+     * \param grid_width The number of cell columns in the grid.
+     * \param grid_height The number of cell rows in the grid.
+     * \param halo_value The value of cells outside the grid.
      */
     MonotileExecutionKernel(TransFunc trans_func, uindex_t i_generation, uindex_t n_generations,
                             uindex_t grid_width, uindex_t grid_height, T halo_value)
         : trans_func(trans_func), i_generation(i_generation), n_generations(n_generations),
           grid_width(grid_width), grid_height(grid_height), halo_value(halo_value) {}
 
+    /**
+     * \brief Execute the kernel.
+     */
     void operator()() const {
         [[intel::fpga_register]] index_t c[pipeline_length];
         [[intel::fpga_register]] index_t r[pipeline_length];

diff --git a/StencilStream/MonotileExecutor.hpp b/StencilStream/MonotileExecutor.hpp
@@ -27,11 +27,16 @@
 namespace stencil {
 template <typename T, uindex_t stencil_radius, typename TransFunc, uindex_t pipeline_length = 1,
           uindex_t tile_width = 1024, uindex_t tile_height = 1024, uindex_t burst_size = 1024>
-class MonotileExecutor : public SingleQueueExecutor<T, stencil_radius, TransFunc, pipeline_length> {
+/**
+ * \brief An executor that follows \ref monotile.
+ *
+ *
+ */
+class MonotileExecutor : public SingleQueueExecutor<T, stencil_radius, TransFunc> {
   public:
     static constexpr uindex_t burst_length = std::min<uindex_t>(1, burst_size / sizeof(T));
     static constexpr uindex_t halo_radius = stencil_radius * pipeline_length;
-    using Parent = SingleQueueExecutor<T, stencil_radius, TransFunc, pipeline_length>;
+    using Parent = SingleQueueExecutor<T, stencil_radius, TransFunc>;
 
     MonotileExecutor(T halo_value, TransFunc trans_func)
         : Parent(halo_value, trans_func), tile_buffer(cl::sycl::range<2>(tile_width, tile_height)) {
@@ -72,8 +77,7 @@ class MonotileExecutor : public SingleQueueExecutor<T, stencil_radius, TransFunc
         return UID(tile_buffer.get_range()[0], tile_buffer.get_range()[1]);
     }
 
-  protected:
-    std::optional<double> run_pass(uindex_t target_i_generation) override {
+    void run(uindex_t n_generations) override {
         using in_pipe = cl::sycl::pipe<class monotile_in_pipe, T>;
         using out_pipe = cl::sycl::pipe<class monotile_out_pipe, T>;
         using ExecutionKernelImpl =
@@ -82,58 +86,64 @@ class MonotileExecutor : public SingleQueueExecutor<T, stencil_radius, TransFunc
 
         cl::sycl::queue &queue = this->get_queue();
 
+        uindex_t target_i_generation = this->get_i_generation() + n_generations;
         uindex_t grid_width = tile_buffer.get_range()[0];
         uindex_t grid_height = tile_buffer.get_range()[1];
-        cl::sycl::buffer<T, 2> out_buffer(tile_buffer.get_range());
-
-        queue.submit([&](cl::sycl::handler &cgh) {
-            auto ac = tile_buffer.template get_access<cl::sycl::access::mode::read>(cgh);
-            T halo_value = this->get_halo_value();
-
-            cgh.single_task<class MonotileInputKernel>([=]() {
-                [[intel::loop_coalesce(2)]] for (uindex_t c = 0; c < tile_width; c++) {
-                    for (uindex_t r = 0; r < tile_height; r++) {
-                        T value;
-                        if (c < grid_width && r < grid_height) {
-                            value = ac[c][r];
-                        } else {
-                            value = halo_value;
-                        }
 
-                        in_pipe::write(value);
+        while (this->get_i_generation() < target_i_generation) {
+            cl::sycl::buffer<T, 2> out_buffer(tile_buffer.get_range());
+
+            queue.submit([&](cl::sycl::handler &cgh) {
+                auto ac = tile_buffer.template get_access<cl::sycl::access::mode::read>(cgh);
+                T halo_value = this->get_halo_value();
+
+                cgh.single_task<class MonotileInputKernel>([=]() {
+                    [[intel::loop_coalesce(2)]] for (uindex_t c = 0; c < tile_width; c++) {
+                        for (uindex_t r = 0; r < tile_height; r++) {
+                            T value;
+                            if (c < grid_width && r < grid_height) {
+                                value = ac[c][r];
+                            } else {
+                                value = halo_value;
+                            }
+
+                            in_pipe::write(value);
+                        }
                     }
-                }
+                });
+            });
+
+            cl::sycl::event computation_event = queue.submit([&](cl::sycl::handler &cgh) {
+                cgh.single_task(ExecutionKernelImpl(
+                    this->get_trans_func(), this->get_i_generation(), target_i_generation,
+                    grid_width, grid_height, this->get_halo_value()));
             });
-        });
-
-        cl::sycl::event computation_event = queue.submit([&](cl::sycl::handler &cgh) {
-            cgh.single_task(ExecutionKernelImpl(this->get_trans_func(), this->get_i_generation(),
-                                                target_i_generation, grid_width, grid_height,
-                                                this->get_halo_value()));
-        });
-
-        queue.submit([&](cl::sycl::handler &cgh) {
-            auto ac = out_buffer.template get_access<cl::sycl::access::mode::discard_write>(cgh);
-            T halo_value = this->get_halo_value();
-
-            cgh.single_task<class MonotileInputKernel>([=]() {
-                [[intel::loop_coalesce(2)]] for (uindex_t c = 0; c < tile_width; c++) {
-                    for (uindex_t r = 0; r < tile_height; r++) {
-                        T value = out_pipe::read();
-                        if (c < grid_width && r < grid_height) {
-                            ac[c][r] = value;
+
+            queue.submit([&](cl::sycl::handler &cgh) {
+                auto ac =
+                    out_buffer.template get_access<cl::sycl::access::mode::discard_write>(cgh);
+                T halo_value = this->get_halo_value();
+
+                cgh.single_task<class MonotileInputKernel>([=]() {
+                    [[intel::loop_coalesce(2)]] for (uindex_t c = 0; c < tile_width; c++) {
+                        for (uindex_t r = 0; r < tile_height; r++) {
+                            T value = out_pipe::read();
+                            if (c < grid_width && r < grid_height) {
+                                ac[c][r] = value;
+                            }
                         }
                     }
-                }
+                });
             });
-        });
 
-        tile_buffer = out_buffer;
+            tile_buffer = out_buffer;
+
+            if (this->is_runtime_analysis_enabled()) {
+                this->get_runtime_sample().add_pass(computation_event);
+            }
 
-        if (this->is_runtime_analysis_enabled()) {
-            return RuntimeSample::runtime_of_event(computation_event);
-        } else {
-            return std::nullopt;
+            this->inc_i_generation(
+                std::min(target_i_generation - this->get_i_generation(), pipeline_length));
         }
     }
 

diff --git a/StencilStream/RuntimeSample.hpp b/StencilStream/RuntimeSample.hpp
@@ -56,6 +56,8 @@ class RuntimeSample {
         n_passes += 1.0;
     }
 
+    void add_pass(cl::sycl::event event) { add_pass(runtime_of_event(event)); }
+
     /**
      * \brief Calculate the total runtime of all passes.
      *
-Original file line number
+Diff line change
@@ Expand Up / @@ -56,6 +56,8 @@ class RuntimeSample { @@
             n_passes += 1.0;
         }
+        void add_pass(cl::sycl::event event) { add_pass(runtime_of_event(event)); }
         /**
          * \brief Calculate the total runtime of all passes.
          *
@@ Expand Down @@