From bd6c0ba6596fa49c20c3a1a49c0bf663e3a5d5fb Mon Sep 17 00:00:00 2001
From: Eyal Rozenberg <eyalroz1@gmx.com>
Date: Mon, 22 Jan 2024 15:08:49 +0200
Subject: [PATCH] Regards #573: Removed unnecessary inclusions of
 `<cuda_runtime.h>`

---
 src/cuda/api/apriori_compiled_kernel.hpp      |  5 +++++
 src/cuda/api/array.hpp                        |  2 --
 src/cuda/api/kernel.hpp                       |  2 --
 src/cuda/api/kernel_launch.hpp                |  9 +++++---
 src/cuda/api/memory.hpp                       |  3 ++-
 src/cuda/api/miscellany.hpp                   | 21 ++++++++++---------
 .../api/multi_wrapper_impls/kernel_launch.hpp |  3 +++
 src/cuda/api/texture_view.hpp                 |  1 -
 8 files changed, 27 insertions(+), 19 deletions(-)
diff --git a/src/cuda/api/apriori_compiled_kernel.hpp b/src/cuda/api/apriori_compiled_kernel.hpp
index af8aaada..b98c70d8 100644
--- a/src/cuda/api/apriori_compiled_kernel.hpp
+++ b/src/cuda/api/apriori_compiled_kernel.hpp
@@ -10,6 +10,11 @@
 
 #include "kernel.hpp"
 #include "current_context.hpp"
+
+// The following is needed for the occupancy-calculation convenience
+// functions and for the kernel-attribute-related API functions
+#include <cuda_runtime.h>
+
 #include <type_traits>
 
 namespace cuda {
diff --git a/src/cuda/api/array.hpp b/src/cuda/api/array.hpp
index 6fc78de7..03c9fa9b 100644
--- a/src/cuda/api/array.hpp
+++ b/src/cuda/api/array.hpp
@@ -15,8 +15,6 @@
 #include "context.hpp"
 #include "error.hpp"
 
-#include <cuda_runtime.h>
-
 #ifndef CUDA_NO_HALF
 #include <cuda_fp16.h>
 #endif
diff --git a/src/cuda/api/kernel.hpp b/src/cuda/api/kernel.hpp
index 03101d6b..44a29f7d 100644
--- a/src/cuda/api/kernel.hpp
+++ b/src/cuda/api/kernel.hpp
@@ -16,8 +16,6 @@
 #include "types.hpp"
 #include "current_context.hpp"
 
-#include <cuda_runtime.h>
-
 #if CUDA_VERSION < 11000
 #define CAN_GET_APRIORI_KERNEL_HANDLE 0
 #define VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE virtual
diff --git a/src/cuda/api/kernel_launch.hpp b/src/cuda/api/kernel_launch.hpp
index ca21de13..3992b991 100644
--- a/src/cuda/api/kernel_launch.hpp
+++ b/src/cuda/api/kernel_launch.hpp
@@ -32,9 +32,7 @@
  * @note Even though when you use this wrapper, your code will not have the silly
  * chevron, you can't use it from regular `.cpp` files compiled with your host
  * compiler. Hence the `.cuh` extension. You _can_, however, safely include this
- * file from your `.cpp` for other definitions. Theoretically, we could have
- * used the `cudaLaunchKernel` API function, by creating an array on the stack
- * which points to all of the other arguments, but that's kind of redundant.
+ * file from your `.cpp` for other definitions.
  *
  */
 
@@ -46,6 +44,11 @@
 #include "kernel.hpp"
 #include "apriori_compiled_kernel.hpp"
 
+#if CUDA_VERSION >= 9000
+// The following is necessary for cudaLaunchCooperativeKernel
+#include <cuda_runtime.h>
+#endif // CUDA_VERSION >= 9000
+
 #include <type_traits>
 #include <utility>
 
diff --git a/src/cuda/api/memory.hpp b/src/cuda/api/memory.hpp
index a641df64..16436c8f 100644
--- a/src/cuda/api/memory.hpp
+++ b/src/cuda/api/memory.hpp
@@ -34,7 +34,8 @@
 #include "pointer.hpp"
 #include "current_context.hpp"
 
-#include <cuda_runtime.h> // needed, rather than cuda_runtime_api.h, e.g. for cudaMalloc
+// The following is needed for cudaGetSymbolAddress, cudaGetSymbolSize
+#include <cuda_runtime.h>
 
 #include <memory>
 #include <cstring> // for ::std::memset
diff --git a/src/cuda/api/miscellany.hpp b/src/cuda/api/miscellany.hpp
index 08c4f4d1..6d04f547 100644
--- a/src/cuda/api/miscellany.hpp
+++ b/src/cuda/api/miscellany.hpp
@@ -9,8 +9,6 @@
 #define CUDA_API_WRAPPERS_MISCELLANY_HPP_
 
 #include "types.hpp"
-
-#include <cuda_runtime_api.h>
 #include "error.hpp"
 
 #include <ostream>
@@ -27,14 +25,14 @@ namespace cuda {
  */
 inline void initialize_driver()
 {
-	static constexpr const unsigned dummy_flags { 0 }; // this is the only allowed value for flags
+	static constexpr const unsigned dummy_flags{0}; // this is the only allowed value for flags
 	auto status = cuInit(dummy_flags);
 	throw_if_error_lazy(status, "Failed initializing the CUDA driver");
 }
 
 inline void ensure_driver_is_initialized()
 {
-	thread_local bool driver_known_to_be_initialized { false };
+	thread_local bool driver_known_to_be_initialized{false};
 	if (not driver_known_to_be_initialized) {
 		initialize_driver();
 		driver_known_to_be_initialized = true;
@@ -58,14 +56,17 @@ namespace device {
 inline device::id_t count()
 {
 	initialize_driver();
-		// This function is often called before any device is obtained (which is where we
-		// expect the driver to be initialized)
+	// This function is often called before any device is obtained (which is where we
+	// expect the driver to be initialized)
 	int device_count = 0; // Initializing, just to be on the safe side
 	status_t result = cuDeviceGetCount(&device_count);
-	switch(result) {
-		case status::no_device: return 0;
-		case status::success: break;
-		default: throw runtime_error(result, "Failed obtaining the number of CUDA devices on the system");
+	switch (result) {
+	case status::no_device:
+		return 0;
+	case status::success:
+		break;
+	default:
+		throw runtime_error(result, "Failed obtaining the number of CUDA devices on the system");
 	}
 	if (device_count < 0) {
 		throw ::std::logic_error("cudaGetDeviceCount() reports an invalid number of CUDA devices");
diff --git a/src/cuda/api/multi_wrapper_impls/kernel_launch.hpp b/src/cuda/api/multi_wrapper_impls/kernel_launch.hpp
index c12c5f30..523fc90f 100644
--- a/src/cuda/api/multi_wrapper_impls/kernel_launch.hpp
+++ b/src/cuda/api/multi_wrapper_impls/kernel_launch.hpp
@@ -16,6 +16,9 @@
 #include "../pointer.hpp"
 #include "../device.hpp"
 
+// The following is needed for occupancy-related calculation convenience functions
+#include <cuda_runtime.h>
+
 namespace cuda {
 
 namespace detail_ {
diff --git a/src/cuda/api/texture_view.hpp b/src/cuda/api/texture_view.hpp
index 29601697..d28a32e8 100644
--- a/src/cuda/api/texture_view.hpp
+++ b/src/cuda/api/texture_view.hpp
@@ -12,7 +12,6 @@
 #include "array.hpp"
 #include "error.hpp"
 #include "memory.hpp"
-#include <cuda_runtime.h>
 
 namespace cuda {