
Merge pull request #4 from CNugteren/development
Updated to version 5.0
CNugteren committed Apr 22, 2016
2 parents 0ad67ce + d9eb1aa commit 4a81c2b
Showing 5 changed files with 99 additions and 32 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG
@@ -1,4 +1,11 @@

Version 5.0 (2016-04-21):
- Buffers can now also be 'not owned' to disable automatic memory freeing afterwards
- Made 'Buffer::Read' and 'Buffer::ReadAsync' constant methods
- Added new methods to the API:
* Event::WaitForCompletion (OpenCL only)
* Kernel::Launch (version with OpenCL waiting list)

Version 4.0 (2015-11-01):
- Made 'CopyTo' and 'CopyToAsync' constant methods
- Added offset support to the Buffer class (credits go to 'ielhelw')
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -31,7 +31,7 @@
# CMake project details
cmake_minimum_required(VERSION 2.8.10)
project("CLCudaAPI" CXX)
set(CLCudaAPI_VERSION_MAJOR 4)
set(CLCudaAPI_VERSION_MAJOR 5)
set(CLCudaAPI_VERSION_MINOR 0)

# ==================================================================================================
15 changes: 10 additions & 5 deletions doc/api.md
@@ -14,6 +14,9 @@ Creates a new event, to be used for example when timing kernels.

Public method(s):

* `void WaitForCompletion() const`:
Waits for completion of an event (OpenCL) or does nothing (CUDA).

* `float GetElapsedTime() const`:
Retrieves the elapsed time in milliseconds of the last recorded event (e.g. a device kernel). This method first makes sure that the last event is finished before computing the elapsed time.

@@ -162,13 +165,13 @@ template \<typename T\> CLCudaAPI::Buffer

Constant(s):

* `enum class BufferAccess { kReadOnly, kWriteOnly, kReadWrite }`
Defines the different access types for the buffers. Writing to a read-only buffer will throw an error, as will reading from a write-only buffer.
* `enum class BufferAccess { kReadOnly, kWriteOnly, kReadWrite, kNotOwned }`
Defines the different access types for the buffers. Writing to a read-only buffer will throw an error, as will reading from a write-only buffer. A buffer which is of type `kNotOwned` will not be automatically freed afterwards.

Constructor(s):

* `Buffer(const Context &context, const BufferAccess access, const size_t size)`:
Initializes a new linear 1D memory buffer on the device of type T. This buffer is allocated with a fixed number of elements given by `size`. Note that the buffer's elements are not initialized. The buffer can be read-only, write-only, or read-write, as specified by the `access` argument.
Initializes a new linear 1D memory buffer on the device of type T. This buffer is allocated with a fixed number of elements given by `size`. Note that the buffer's elements are not initialized. The buffer can be read-only, write-only, read-write, or not-owned, as specified by the `access` argument.

* `Buffer(const Context &context, const size_t size)`:
As above, but now defaults to read-write access.
@@ -178,12 +181,12 @@ Creates a new buffer based on data in a linear C++ container (such as `std::vector`)

Public method(s):

* `void ReadAsync(const Queue &queue, const size_t size, T* host)` and
* `void ReadAsync(const Queue &queue, const size_t size, T* host) const` and
`void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host)` and
`void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host)`:
Copies `size` elements from the current device buffer to the target host buffer. The host buffer has to be pre-allocated with a size of at least `size` elements. This method is asynchronous: it can return before the copy operation has completed.

* `void Read(const Queue &queue, const size_t size, T* host)` and
* `void Read(const Queue &queue, const size_t size, T* host) const` and
`void Read(const Queue &queue, const size_t size, std::vector<T> &host)` and
`void Read(const Queue &queue, const size_t size, BufferHost<T> &host)`:
As above, but now completes the operation before returning.
@@ -229,5 +232,7 @@ Retrieves the amount of on-chip scratchpad memory (local memory in OpenCL, shared memory in CUDA)
* `Launch(const Queue &queue, const std::vector<size_t> &global, const std::vector<size_t> &local, Event &event)`:
Launches a kernel onto the specified queue. The launch is asynchronous: this method can return before the device kernel has completed. The total number of threads launched is given by the `global` vector; the number of threads per OpenCL work-group or CUDA thread-block is given by the `local` vector. The elapsed time is recorded into the `event` argument.

* `Launch(const Queue &queue, const std::vector<size_t> &global, const std::vector<size_t> &local, Event &event, std::vector<Event>& waitForEvents)`: As above, but now this kernel is only launched after the other specified events have finished (OpenCL only).

* `Launch(const Queue &queue, const std::vector<size_t> &global, Event &event)`: As above, but now the local size is determined automatically (OpenCL only).

58 changes: 46 additions & 12 deletions include/clpp11.h
@@ -12,7 +12,7 @@
// Portability here means that a similar header exists for CUDA with the same classes and
// interfaces. In other words, moving from the OpenCL API to the CUDA API becomes a one-line change.
//
// This is version 4.0 of CLCudaAPI.
// This is version 5.0 of CLCudaAPI.
//
// =================================================================================================
//
@@ -77,11 +77,16 @@ class Event {
// Regular constructor
explicit Event(): event_(nullptr) { }

// Waits for completion of this event
void WaitForCompletion() const {
CheckError(clWaitForEvents(1, &event_));
}

// Retrieves the elapsed time of the last recorded event. Note that no error checking is done on
// the 'clGetEventProfilingInfo' function, since there is a bug in Apple's OpenCL implementation:
// http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx
float GetElapsedTime() const {
CheckError(clWaitForEvents(1, &event_));
WaitForCompletion();
auto bytes = size_t{0};
clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes);
auto time_start = size_t{0};
@@ -94,10 +99,14 @@

// Accessor to the private data-member
cl_event& operator()() { return event_; }
cl_event* pointer() { return &event_; }
private:
cl_event event_;
};

// Pointer to an OpenCL event
using EventPointer = cl_event*;

// =================================================================================================

// C++11 version of 'cl_platform_id'
@@ -420,7 +429,7 @@ class BufferHost {
// =================================================================================================

// Enumeration of buffer access types
enum class BufferAccess { kReadOnly, kWriteOnly, kReadWrite };
enum class BufferAccess { kReadOnly, kWriteOnly, kReadWrite, kNotOwned };

// C++11 version of 'cl_mem'
template <typename T>
@@ -430,13 +439,17 @@ class Buffer {
// Constructor based on the regular OpenCL data-type: memory management is handled elsewhere
explicit Buffer(const cl_mem buffer):
buffer_(new cl_mem),
access_(BufferAccess::kReadWrite) {
access_(BufferAccess::kNotOwned) {
*buffer_ = buffer;
}

// Regular constructor with memory management
// Regular constructor with memory management. If this class does not own the buffer object, then
// the memory will not be freed automatically afterwards.
explicit Buffer(const Context &context, const BufferAccess access, const size_t size):
buffer_(new cl_mem, [](cl_mem* m) { CheckError(clReleaseMemObject(*m)); delete m; }),
buffer_(new cl_mem, [access](cl_mem* m) {
if (access != BufferAccess::kNotOwned) { CheckError(clReleaseMemObject(*m)); }
delete m;
}),
access_(access) {
auto flags = cl_mem_flags{CL_MEM_READ_WRITE};
if (access_ == BufferAccess::kReadOnly) { flags = CL_MEM_READ_ONLY; }
@@ -463,31 +476,33 @@
}

// Copies from device to host: reading the device buffer asynchronously
void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) {
void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
if (access_ == BufferAccess::kWriteOnly) { Error("reading from a write-only buffer"); }
CheckError(clEnqueueReadBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
host, 0, nullptr, nullptr));
}
void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host,
const size_t offset = 0) {
const size_t offset = 0) const {
if (host.size() < size) { Error("target host buffer is too small"); }
ReadAsync(queue, size, host.data(), offset);
}
void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host,
const size_t offset = 0) {
const size_t offset = 0) const {
if (host.size() < size) { Error("target host buffer is too small"); }
ReadAsync(queue, size, host.data(), offset);
}

// Copies from device to host: reading the device buffer
void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) {
void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
ReadAsync(queue, size, host, offset);
queue.Finish();
}
void Read(const Queue &queue, const size_t size, std::vector<T> &host, const size_t offset = 0) {
void Read(const Queue &queue, const size_t size, std::vector<T> &host,
const size_t offset = 0) const {
Read(queue, size, host.data(), offset);
}
void Read(const Queue &queue, const size_t size, BufferHost<T> &host, const size_t offset = 0) {
void Read(const Queue &queue, const size_t size, BufferHost<T> &host,
const size_t offset = 0) const {
Read(queue, size, host.data(), offset);
}

@@ -602,6 +617,25 @@ class Kernel {
0, nullptr, &(event())));
}

// As above, but with an event waiting list
void Launch(const Queue &queue, const std::vector<size_t> &global,
const std::vector<size_t> &local, Event &event,
std::vector<Event>& waitForEvents) {
if (waitForEvents.size() == 0) { return Launch(queue, global, local, event); }

// Builds a plain version of the events waiting list
auto waitForEventsPlain = std::vector<cl_event>();
for (auto &waitEvent : waitForEvents) {
waitForEventsPlain.push_back(waitEvent());
}

// Launches the kernel while waiting for other events
CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
nullptr, global.data(), local.data(),
waitForEventsPlain.size(), waitForEventsPlain.data(),
&(event())));
}

// As above, but with the default local workgroup size
void Launch(const Queue &queue, const std::vector<size_t> &global, Event &event) {
CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
49 changes: 35 additions & 14 deletions include/cupp11.h
@@ -12,7 +12,7 @@
// Portability here means that a similar header exists for OpenCL with the same classes and
// interfaces. In other words, moving from the CUDA API to the OpenCL API becomes a one-line change.
//
// This is version 4.0 of CLCudaAPI.
// This is version 5.0 of CLCudaAPI.
//
// =================================================================================================
//
@@ -94,6 +94,9 @@ class Event {
CheckError(cuEventCreate(end_.get(), CU_EVENT_DEFAULT));
}

// Waits for completion of this event (not implemented for CUDA)
void WaitForCompletion() const { }

// Retrieves the elapsed time of the last recorded event
float GetElapsedTime() const {
auto result = 0.0f;
@@ -109,6 +112,9 @@
std::shared_ptr<CUevent> end_;
};

// Pointer to a CUDA event
using EventPointer = CUevent*;

// =================================================================================================

// The CUDA platform: initializes the CUDA driver API
@@ -375,7 +381,7 @@ class BufferHost {
// =================================================================================================

// Enumeration of buffer access types
enum class BufferAccess { kReadOnly, kWriteOnly, kReadWrite };
enum class BufferAccess { kReadOnly, kWriteOnly, kReadWrite, kNotOwned };

// C++11 version of 'CUdeviceptr'
template <typename T>
@@ -385,13 +391,17 @@ class Buffer {
// Constructor based on the regular CUDA data-type: memory management is handled elsewhere
explicit Buffer(const CUdeviceptr buffer):
buffer_(new CUdeviceptr),
access_(BufferAccess::kReadWrite) {
access_(BufferAccess::kNotOwned) {
*buffer_ = buffer;
}

// Regular constructor with memory management
// Regular constructor with memory management. If this class does not own the buffer object, then
// the memory will not be freed automatically afterwards.
explicit Buffer(const Context &, const BufferAccess access, const size_t size):
buffer_(new CUdeviceptr, [](CUdeviceptr* m) { CheckError(cuMemFree(*m)); delete m; }),
buffer_(new CUdeviceptr, [access](CUdeviceptr* m) {
if (access != BufferAccess::kNotOwned) { CheckError(cuMemFree(*m)); }
delete m;
}),
access_(access) {
CheckError(cuMemAlloc(buffer_.get(), size*sizeof(T)));
}
@@ -412,30 +422,32 @@
}

// Copies from device to host: reading the device buffer asynchronously
void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) {
void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
if (access_ == BufferAccess::kWriteOnly) { Error("reading from a write-only buffer"); }
CheckError(cuMemcpyDtoHAsync(host, *buffer_ + offset*sizeof(T), size*sizeof(T), queue()));
}
void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host,
const size_t offset = 0) {
const size_t offset = 0) const {
if (host.size() < size) { Error("target host buffer is too small"); }
ReadAsync(queue, size, host.data(), offset);
}
void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host,
const size_t offset = 0) {
const size_t offset = 0) const {
if (host.size() < size) { Error("target host buffer is too small"); }
ReadAsync(queue, size, host.data(), offset);
}

// Copies from device to host: reading the device buffer
void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) {
void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
ReadAsync(queue, size, host, offset);
queue.Finish();
}
void Read(const Queue &queue, const size_t size, std::vector<T> &host, const size_t offset = 0) {
void Read(const Queue &queue, const size_t size, std::vector<T> &host,
const size_t offset = 0) const {
Read(queue, size, host.data(), offset);
}
void Read(const Queue &queue, const size_t size, BufferHost<T> &host, const size_t offset = 0) {
void Read(const Queue &queue, const size_t size, BufferHost<T> &host,
const size_t offset = 0) const {
Read(queue, size, host.data(), offset);
}

@@ -485,7 +497,7 @@ class Buffer {
}

// Accessors to the private data-members
const CUdeviceptr operator()() const { return *buffer_; }
CUdeviceptr operator()() const { return *buffer_; }
CUdeviceptr& operator()() { return *buffer_; }
private:
std::shared_ptr<CUdeviceptr> buffer_;
@@ -536,7 +548,7 @@ class Kernel {

// Retrieves the amount of local memory used per work-group for this kernel. Note that this is
// the shared memory in CUDA terminology.
size_t LocalMemUsage(const Device &device) const {
size_t LocalMemUsage(const Device &) const {
auto result = 0;
CheckError(cuFuncGetAttribute(&result, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel_));
return static_cast<size_t>(result);
@@ -566,9 +578,18 @@
CheckError(cuEventRecord(event.end(), queue()));
}

// As above, but with an event waiting list
// TODO: Implement this function
void Launch(const Queue &queue, const std::vector<size_t> &global,
const std::vector<size_t> &local, Event &event,
std::vector<Event>& waitForEvents) {
if (waitForEvents.size() == 0) { return Launch(queue, global, local, event); }
Error("launching with an event waiting list is not implemented for the CUDA back-end");
}

// As above, but with the default local workgroup size
// TODO: Implement this function
void Launch(const Queue &queue, const std::vector<size_t> &global, Event &event) {
void Launch(const Queue &, const std::vector<size_t> &, Event &) {
Error("launching with a default workgroup size is not implemented for the CUDA back-end");
}

