From 397b7c7ca23ddf85e199d5cc80aef55c811f1a20 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Fri, 25 Oct 2024 20:25:03 +0000
Subject: [PATCH 1/7] support kernel launch with CUDA 11 driver

---
 cuda_core/cuda/core/experimental/_launcher.py | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_launcher.py b/cuda_core/cuda/core/experimental/_launcher.py
index 614fabcf..4951358d 100644
--- a/cuda_core/cuda/core/experimental/_launcher.py
+++ b/cuda_core/cuda/core/experimental/_launcher.py
@@ -19,6 +19,7 @@ class LaunchConfig:
     """
     """
+    # TODO: expand LaunchConfig to include other attributes
     grid: Union[tuple, int] = None
     block: Union[tuple, int] = None
     stream: Stream = None
@@ -67,24 +68,30 @@ def launch(kernel, config, *kernel_args):
     if not isinstance(kernel, Kernel):
         raise ValueError
     config = check_or_create_options(LaunchConfig, config, "launch config")
+    if config.stream is None:
+        raise CUDAError("stream cannot be None")
+    # TODO: can we ensure kernel_args is valid/safe to use here?
+    # TODO: merge with HelperKernelParams?
+    kernel_args = ParamHolder(kernel_args)
+    args_ptr = kernel_args.ptr
 
     driver_ver = handle_return(cuda.cuDriverGetVersion())
     if driver_ver >= 12000:
         drv_cfg = cuda.CUlaunchConfig()
         drv_cfg.gridDimX, drv_cfg.gridDimY, drv_cfg.gridDimZ = config.grid
         drv_cfg.blockDimX, drv_cfg.blockDimY, drv_cfg.blockDimZ = config.block
-        if config.stream is None:
-            raise CUDAError("stream cannot be None")
         drv_cfg.hStream = config.stream._handle
         drv_cfg.sharedMemBytes = config.shmem_size
-        drv_cfg.numAttrs = 0  # FIXME
-
-        # TODO: merge with HelperKernelParams?
-        kernel_args = ParamHolder(kernel_args)
-        args_ptr = kernel_args.ptr
-
+        drv_cfg.numAttrs = 0  # TODO
         handle_return(cuda.cuLaunchKernelEx(
             drv_cfg, int(kernel._handle), args_ptr, 0))
     else:
-        raise NotImplementedError("TODO")
+        # TODO: check if config has any unsupported attrs
+        handle_return(cuda.cuLaunchKernel(
+            int(kernel._handle),
+            *config.grid,
+            *config.block,
+            config.shmem_size,
+            config.stream._handle,
+            args_ptr, 0))

From b319731f05e15061133cd617732052154c624d46 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Fri, 25 Oct 2024 14:00:42 -0700
Subject: [PATCH 2/7] fix module load for cuda-python 11.x

---
 cuda_core/cuda/core/experimental/_module.py | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py
index a179faf8..325c307a 100644
--- a/cuda_core/cuda/core/experimental/_module.py
+++ b/cuda_core/cuda/core/experimental/_module.py
@@ -2,22 +2,31 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
+import importlib.metadata
+
 from cuda import cuda, cudart
 from cuda.core.experimental._utils import handle_return
 
 
 _backend = {
-    "new": {
-        "file": cuda.cuLibraryLoadFromFile,
-        "data": cuda.cuLibraryLoadData,
-        "kernel": cuda.cuLibraryGetKernel,
-    },
     "old": {
         "file": cuda.cuModuleLoad,
         "data": cuda.cuModuleLoadDataEx,
         "kernel": cuda.cuModuleGetFunction,
     },
 }
+_kernel_ctypes = [cuda.CUfunction]
+
+# binding availability depends on cuda-python version
+py_major_ver = int(importlib.metadata.version("cuda-python").split(".")[0])
+if py_major_ver >= 12:
+    _backend["new"] = {
+        "file": cuda.cuLibraryLoadFromFile,
+        "data": cuda.cuLibraryLoadData,
+        "kernel": cuda.cuLibraryGetKernel,
+    }
+    _kernel_ctypes.append(cuda.CUkernel)
+_kernel_ctypes = tuple(_kernel_ctypes)
 
 
 class Kernel:
@@ -29,7 +38,7 @@ def __init__(self):
 
     @staticmethod
     def _from_obj(obj, mod):
-        assert isinstance(obj, (cuda.CUkernel, cuda.CUfunction))
+        assert isinstance(obj, _kernel_ctypes)
         assert isinstance(mod, ObjectCode)
         ker = Kernel.__new__(Kernel)
         ker._handle = obj
@@ -49,7 +58,10 @@ def __init__(self, module, code_type, jit_options=None, *,
         self._handle = None
 
         driver_ver = handle_return(cuda.cuDriverGetVersion())
-        self._loader = _backend["new"] if driver_ver >= 12000 else _backend["old"]
+        if py_major_ver >= 12 and driver_ver >= 12000:
+            self._loader = _backend["new"]
+        else:
+            self._loader = _backend["old"]
 
         if isinstance(module, str):
             if driver_ver < 12000 and jit_options is not None:
@@ -65,7 +77,7 @@ def __init__(self, module, code_type, jit_options=None, *,
                     # TODO: support library options
                     [], [], 0)
             else:
-                args = (module, len(jit_options), jit_options.keys(), jit_options.values())
+                args = (module, len(jit_options), list(jit_options.keys()), list(jit_options.values()))
             self._handle = handle_return(self._loader["data"](*args))
 
             self._code_type = code_type

From b64f3379c1c1599d506ba374374614bdc699ad30 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Fri, 25 Oct 2024 18:17:01 -0700
Subject: [PATCH 3/7] simplify & fix module/library handling

---
 cuda_core/cuda/core/experimental/_module.py | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py
index 325c307a..a51ab24f 100644
--- a/cuda_core/cuda/core/experimental/_module.py
+++ b/cuda_core/cuda/core/experimental/_module.py
@@ -15,7 +15,6 @@
         "kernel": cuda.cuModuleGetFunction,
     },
 }
-_kernel_ctypes = [cuda.CUfunction]
 
 # binding availability depends on cuda-python version
 py_major_ver = int(importlib.metadata.version("cuda-python").split(".")[0])
@@ -25,8 +24,10 @@
         "data": cuda.cuLibraryLoadData,
         "kernel": cuda.cuLibraryGetKernel,
     }
-    _kernel_ctypes.append(cuda.CUkernel)
-_kernel_ctypes = tuple(_kernel_ctypes)
+    _kernel_ctypes = (cuda.CUfunction, cuda.CUkernel)
+else:
+    _kernel_ctypes = (cuda.CUfunction,)
+driver_ver = handle_return(cuda.cuDriverGetVersion())
 
 
 class Kernel:
@@ -45,6 +46,8 @@ def _from_obj(obj, mod):
         ker._module = mod
         return ker
 
+    # TODO: implement from_handle()
+
 
 class ObjectCode:
 
@@ -57,11 +60,8 @@ def __init__(self, module, code_type, jit_options=None, *,
             raise ValueError
 
         self._handle = None
-        driver_ver = handle_return(cuda.cuDriverGetVersion())
-        if py_major_ver >= 12 and driver_ver >= 12000:
-            self._loader = _backend["new"]
-        else:
-            self._loader = _backend["old"]
+        backend = "new" if (py_major_ver >= 12 and driver_ver >= 12000) else "old"
+        self._loader = _backend[backend]
 
         if isinstance(module, str):
             if driver_ver < 12000 and jit_options is not None:
@@ -72,11 +72,11 @@ def __init__(self, module, code_type, jit_options=None, *,
             assert isinstance(module, bytes)
             if jit_options is None:
                 jit_options = {}
-            if driver_ver >= 12000:
+            if backend == "new":
                 args = (module, list(jit_options.keys()), list(jit_options.values()), len(jit_options),
                         # TODO: support library options
                         [], [], 0)
-            else:
+            else:  # "old" backend
                 args = (module, len(jit_options), list(jit_options.keys()), list(jit_options.values()))
             self._handle = handle_return(self._loader["data"](*args))
 
@@ -95,3 +95,5 @@ def get_kernel(self, name):
             name = name.encode()
         data = handle_return(self._loader["kernel"](self._handle, name))
         return Kernel._from_obj(data, self)
+
+    # TODO: implement from_handle()

From 7587684f2d7f3a0d04636271cb5401cb3dbd3cf9 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Fri, 25 Oct 2024 18:54:46 -0700
Subject: [PATCH 4/7] propagate py/driver ver check to launch

---
 cuda_core/cuda/core/experimental/_launcher.py |  9 ++++++---
 cuda_core/cuda/core/experimental/_module.py   | 10 ++++++----
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_launcher.py b/cuda_core/cuda/core/experimental/_launcher.py
index 4951358d..b1223365 100644
--- a/cuda_core/cuda/core/experimental/_launcher.py
+++ b/cuda_core/cuda/core/experimental/_launcher.py
@@ -76,8 +76,11 @@ def launch(kernel, config, *kernel_args):
     kernel_args = ParamHolder(kernel_args)
     args_ptr = kernel_args.ptr
 
-    driver_ver = handle_return(cuda.cuDriverGetVersion())
-    if driver_ver >= 12000:
+    # Note: CUkernel can still be launched via the old cuLaunchKernel. We check ._backend
+    # here not because of the CUfunction/CUkernel difference (which depends on whether the
+    # "old" or "new" module loading APIs are in use), but only as a proxy to check if
+    # both binding & driver versions support the "Ex" API, which is more feature rich.
+    if kernel._backend == "new":
         drv_cfg = cuda.CUlaunchConfig()
         drv_cfg.gridDimX, drv_cfg.gridDimY, drv_cfg.gridDimZ = config.grid
         drv_cfg.blockDimX, drv_cfg.blockDimY, drv_cfg.blockDimZ = config.block
@@ -86,7 +89,7 @@ def launch(kernel, config, *kernel_args):
         drv_cfg.numAttrs = 0  # TODO
         handle_return(cuda.cuLaunchKernelEx(
             drv_cfg, int(kernel._handle), args_ptr, 0))
-    else:
+    else:  # "old" backend
         # TODO: check if config has any unsupported attrs
         handle_return(cuda.cuLaunchKernel(
             int(kernel._handle),
diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py
index a51ab24f..e5d0808f 100644
--- a/cuda_core/cuda/core/experimental/_module.py
+++ b/cuda_core/cuda/core/experimental/_module.py
@@ -32,18 +32,19 @@
 
 class Kernel:
 
-    __slots__ = ("_handle", "_module",)
+    __slots__ = ("_handle", "_module", "_backend")
 
     def __init__(self):
        raise NotImplementedError("directly constructing a Kernel instance is not supported")
 
     @staticmethod
-    def _from_obj(obj, mod):
+    def _from_obj(obj, mod, backend):
         assert isinstance(obj, _kernel_ctypes)
         assert isinstance(mod, ObjectCode)
         ker = Kernel.__new__(Kernel)
         ker._handle = obj
         ker._module = mod
+        ker._backend = backend
         return ker
 
     # TODO: implement from_handle()
@@ -52,7 +52,7 @@ def _from_obj(obj, mod):
 
 class ObjectCode:
 
-    __slots__ = ("_handle", "_code_type", "_module", "_loader", "_sym_map")
+    __slots__ = ("_handle", "_code_type", "_module", "_loader", "_loader_backend", "_sym_map")
     _supported_code_type = ("cubin", "ptx", "fatbin")
 
     def __init__(self, module, code_type, jit_options=None, *,
@@ -62,6 +63,7 @@ def __init__(self, module, code_type, jit_options=None, *,
 
         backend = "new" if (py_major_ver >= 12 and driver_ver >= 12000) else "old"
         self._loader = _backend[backend]
+        self._loader_backend = backend
 
         if isinstance(module, str):
             if driver_ver < 12000 and jit_options is not None:
@@ -94,6 +96,6 @@ def get_kernel(self, name):
         except KeyError:
             name = name.encode()
         data = handle_return(self._loader["kernel"](self._handle, name))
-        return Kernel._from_obj(data, self)
+        return Kernel._from_obj(data, self, self._loader_backend)
 
     # TODO: implement from_handle()

From 7fd8ccb85aa542be4f3d68b2dce03931eeef94d6 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Fri, 25 Oct 2024 19:41:59 -0700
Subject: [PATCH 5/7] nit: cleaner treatment

---
 cuda_core/cuda/core/experimental/_launcher.py | 21 +++++++++++++++------
 cuda_core/cuda/core/experimental/_module.py   | 10 ++++------
 2 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_launcher.py b/cuda_core/cuda/core/experimental/_launcher.py
index b1223365..c3af8866 100644
--- a/cuda_core/cuda/core/experimental/_launcher.py
+++ b/cuda_core/cuda/core/experimental/_launcher.py
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
 from dataclasses import dataclass
+import importlib.metadata
 from typing import Optional, Union
 
 import numpy as np
@@ -64,6 +65,13 @@ def _cast_to_3_tuple(self, cfg):
             raise ValueError
 
 
+# binding availability depends on cuda-python version
+py_major_minor = tuple(int(v) for v in (
+    importlib.metadata.version("cuda-python").split(".")[:2]))
+driver_ver = handle_return(cuda.cuDriverGetVersion())
+use_ex = (driver_ver >= 11080) and (py_major_minor >= (11, 8))
+
+
 def launch(kernel, config, *kernel_args):
     if not isinstance(kernel, Kernel):
         raise ValueError
@@ -76,11 +84,12 @@ def launch(kernel, config, *kernel_args):
     kernel_args = ParamHolder(kernel_args)
     args_ptr = kernel_args.ptr
 
-    # Note: CUkernel can still be launched via the old cuLaunchKernel. We check ._backend
-    # here not because of the CUfunction/CUkernel difference (which depends on whether the
-    # "old" or "new" module loading APIs are in use), but only as a proxy to check if
-    # both binding & driver versions support the "Ex" API, which is more feature rich.
-    if kernel._backend == "new":
+    # Note: CUkernel can still be launched via the old cuLaunchKernel and we do not care
+    # about the CUfunction/CUkernel difference (which depends on whether the "old" or
+    # "new" module loading APIs are in use). We check both binding & driver versions here
+    # mainly to see if the "Ex" API is available and if so we use it, as it's more feature
+    # rich.
+    if use_ex:
         drv_cfg = cuda.CUlaunchConfig()
         drv_cfg.gridDimX, drv_cfg.gridDimY, drv_cfg.gridDimZ = config.grid
         drv_cfg.blockDimX, drv_cfg.blockDimY, drv_cfg.blockDimZ = config.block
@@ -89,7 +98,7 @@ def launch(kernel, config, *kernel_args):
         drv_cfg.numAttrs = 0  # TODO
         handle_return(cuda.cuLaunchKernelEx(
             drv_cfg, int(kernel._handle), args_ptr, 0))
-    else:  # "old" backend
+    else:
         # TODO: check if config has any unsupported attrs
         handle_return(cuda.cuLaunchKernel(
             int(kernel._handle),
diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py
index e5d0808f..a51ab24f 100644
--- a/cuda_core/cuda/core/experimental/_module.py
+++ b/cuda_core/cuda/core/experimental/_module.py
@@ -32,19 +32,18 @@
 
 class Kernel:
 
-    __slots__ = ("_handle", "_module", "_backend")
+    __slots__ = ("_handle", "_module",)
 
     def __init__(self):
         raise NotImplementedError("directly constructing a Kernel instance is not supported")
 
     @staticmethod
-    def _from_obj(obj, mod, backend):
+    def _from_obj(obj, mod):
         assert isinstance(obj, _kernel_ctypes)
         assert isinstance(mod, ObjectCode)
         ker = Kernel.__new__(Kernel)
         ker._handle = obj
         ker._module = mod
-        ker._backend = backend
         return ker
 
     # TODO: implement from_handle()
@@ -52,7 +51,7 @@ def _from_obj(obj, mod):
 
 class ObjectCode:
 
-    __slots__ = ("_handle", "_code_type", "_module", "_loader", "_loader_backend", "_sym_map")
+    __slots__ = ("_handle", "_code_type", "_module", "_loader", "_sym_map")
     _supported_code_type = ("cubin", "ptx", "fatbin")
 
     def __init__(self, module, code_type, jit_options=None, *,
@@ -63,7 +62,6 @@ def __init__(self, module, code_type, jit_options=None, *,
 
         backend = "new" if (py_major_ver >= 12 and driver_ver >= 12000) else "old"
         self._loader = _backend[backend]
-        self._loader_backend = backend
 
         if isinstance(module, str):
             if driver_ver < 12000 and jit_options is not None:
@@ -96,6 +94,6 @@ def get_kernel(self, name):
         except KeyError:
             name = name.encode()
         data = handle_return(self._loader["kernel"](self._handle, name))
-        return Kernel._from_obj(data, self, self._loader_backend)
+        return Kernel._from_obj(data, self)
 
     # TODO: implement from_handle()

From b634d700141fb1c754a87e66bcaaed420d4bd887 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Mon, 28 Oct 2024 04:00:48 +0000
Subject: [PATCH 6/7] try to defer driver loading

---
 cuda_core/cuda/core/experimental/_launcher.py | 30 ++++++++++++++++++--------
 cuda_core/cuda/core/experimental/_module.py   | 44 ++++++++++++++++++---------
 2 files changed, 52 insertions(+), 22 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_launcher.py b/cuda_core/cuda/core/experimental/_launcher.py
index c3af8866..4b9533cb 100644
--- a/cuda_core/cuda/core/experimental/_launcher.py
+++ b/cuda_core/cuda/core/experimental/_launcher.py
@@ -16,6 +16,25 @@
 from cuda.core.experimental._utils import CUDAError, check_or_create_options, handle_return
 
 
+# TODO: revisit this treatment for py313t builds
+_inited = False
+_use_ex = None
+
+
+def _lazy_init():
+    global _inited
+    if _inited:
+        return
+
+    global _use_ex
+    # binding availability depends on cuda-python version
+    _py_major_minor = tuple(int(v) for v in (
+        importlib.metadata.version("cuda-python").split(".")[:2]))
+    _driver_ver = handle_return(cuda.cuDriverGetVersion())
+    _use_ex = (_driver_ver >= 11080) and (_py_major_minor >= (11, 8))
+    _inited = True
+
+
 @dataclass
 class LaunchConfig:
     """
@@ -41,6 +60,8 @@ def __post_init__(self):
         if self.shmem_size is None:
             self.shmem_size = 0
 
+        _lazy_init()
+
     def _cast_to_3_tuple(self, cfg):
         if isinstance(cfg, int):
             if cfg < 1:
@@ -65,13 +86,6 @@ def _cast_to_3_tuple(self, cfg):
             raise ValueError
 
 
-# binding availability depends on cuda-python version
-py_major_minor = tuple(int(v) for v in (
-    importlib.metadata.version("cuda-python").split(".")[:2]))
-driver_ver = handle_return(cuda.cuDriverGetVersion())
-use_ex = (driver_ver >= 11080) and (py_major_minor >= (11, 8))
-
-
 def launch(kernel, config, *kernel_args):
     if not isinstance(kernel, Kernel):
         raise ValueError
@@ -89,7 +103,7 @@ def launch(kernel, config, *kernel_args):
     # Note: CUkernel can still be launched via the old cuLaunchKernel and we do not care
     # about the CUfunction/CUkernel difference (which depends on whether the "old" or
     # "new" module loading APIs are in use). We check both binding & driver versions here
     # mainly to see if the "Ex" API is available and if so we use it, as it's more feature
     # rich.
-    if use_ex:
+    if _use_ex:
         drv_cfg = cuda.CUlaunchConfig()
         drv_cfg.gridDimX, drv_cfg.gridDimY, drv_cfg.gridDimZ = config.grid
         drv_cfg.blockDimX, drv_cfg.blockDimY, drv_cfg.blockDimZ = config.block
diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py
index a51ab24f..2e1ca6e9 100644
--- a/cuda_core/cuda/core/experimental/_module.py
+++ b/cuda_core/cuda/core/experimental/_module.py
@@ -16,18 +16,33 @@
     },
 }
 
-# binding availability depends on cuda-python version
-py_major_ver = int(importlib.metadata.version("cuda-python").split(".")[0])
-if py_major_ver >= 12:
-    _backend["new"] = {
-        "file": cuda.cuLibraryLoadFromFile,
-        "data": cuda.cuLibraryLoadData,
-        "kernel": cuda.cuLibraryGetKernel,
-    }
-    _kernel_ctypes = (cuda.CUfunction, cuda.CUkernel)
-else:
-    _kernel_ctypes = (cuda.CUfunction,)
-driver_ver = handle_return(cuda.cuDriverGetVersion())
+
+# TODO: revisit this treatment for py313t builds
+_inited = False
+_py_major_ver = None
+_driver_ver = None
+_kernel_ctypes = None
+
+
+def _lazy_init():
+    global _inited
+    if _inited:
+        return
+
+    global _py_major_ver, _driver_ver, _kernel_ctypes
+    # binding availability depends on cuda-python version
+    _py_major_ver = int(importlib.metadata.version("cuda-python").split(".")[0])
+    if _py_major_ver >= 12:
+        _backend["new"] = {
+            "file": cuda.cuLibraryLoadFromFile,
+            "data": cuda.cuLibraryLoadData,
+            "kernel": cuda.cuLibraryGetKernel,
+        }
+        _kernel_ctypes = (cuda.CUfunction, cuda.CUkernel)
+    else:
+        _kernel_ctypes = (cuda.CUfunction,)
+    _driver_ver = handle_return(cuda.cuDriverGetVersion())
+    _inited = True
 
 
 class Kernel:
@@ -58,13 +73,14 @@ def __init__(self, module, code_type, jit_options=None, *,
                  symbol_mapping=None):
         if code_type not in self._supported_code_type:
             raise ValueError
+        _lazy_init()
 
         self._handle = None
-        backend = "new" if (py_major_ver >= 12 and driver_ver >= 12000) else "old"
+        backend = "new" if (_py_major_ver >= 12 and _driver_ver >= 12000) else "old"
         self._loader = _backend[backend]
 
         if isinstance(module, str):
-            if driver_ver < 12000 and jit_options is not None:
+            if _driver_ver < 12000 and jit_options is not None:
                 raise ValueError
             module = module.encode()
             self._handle = handle_return(self._loader["file"](module))

From 74de685475cdc1c0d114de10552a94e9de88f53b Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Mon, 28 Oct 2024 18:07:47 -0400
Subject: [PATCH 7/7] ignore jit_options unconditionally for now when loading
 from a file

---
 cuda_core/cuda/core/experimental/_module.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py
index 2e1ca6e9..60d4db97 100644
--- a/cuda_core/cuda/core/experimental/_module.py
+++ b/cuda_core/cuda/core/experimental/_module.py
@@ -80,7 +80,9 @@ def __init__(self, module, code_type, jit_options=None, *,
         self._loader = _backend[backend]
 
         if isinstance(module, str):
-            if _driver_ver < 12000 and jit_options is not None:
+            # TODO: this option is only taken by the new library APIs, but we have
+            # a bug that we can't easily support it just yet (NVIDIA/cuda-python#73).
+            if jit_options is not None:
                 raise ValueError
             module = module.encode()
             self._handle = handle_return(self._loader["file"](module))
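
Reviewer note (appended; not part of the commits): below is a minimal end-to-end
sketch for exercising both code paths this series touches. It assumes the public
cuda.core.experimental namespace exports Device, LaunchConfig, Program, and launch
as in this tree; the no-op kernel and the compile options are illustrative, not
part of the patches. With cuda-python 11.x bindings (or a pre-12.0 driver) the
launch should go through cuLaunchKernel and module loading through the "old"
cuModule* backend; with 12.x bindings on an R12+ driver it should take the
cuLibrary* loader and the more feature-rich cuLaunchKernelEx.

    from cuda.core.experimental import Device, LaunchConfig, Program, launch

    # set up a device and stream; constructing LaunchConfig runs _lazy_init()
    dev = Device()
    dev.set_current()
    s = dev.create_stream()

    # a kernel with no arguments keeps the sketch free of memory management
    prog = Program('extern "C" __global__ void noop() {}', code_type="c++")
    arch = "".join(str(i) for i in dev.compute_capability)
    mod = prog.compile("cubin", options=(f"-arch=sm_{arch}",))  # -> ObjectCode
    ker = mod.get_kernel("noop")  # CUfunction or CUkernel depending on the backend

    config = LaunchConfig(grid=1, block=1, stream=s)
    launch(ker, config)  # dispatches per _use_ex to cuLaunchKernel or cuLaunchKernelEx
    s.sync()

Pinning cuda-python to 11.x in a separate test environment is the quickest way to
A/B the "old" path against the "new" one on the same machine.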