From 4445bc2543f7eff69a91392a41199795c6035dfd Mon Sep 17 00:00:00 2001
From: Tim Martin <38798827+tmartin-gh@users.noreply.github.com>
Date: Tue, 17 Dec 2024 13:54:41 -0800
Subject: [PATCH] Python integration sample (#812)

* Added DLPack make_tensor

* Add a self-contained python calling MatX (calling python calling MatX)
  integration example

---------

Co-authored-by: cliffburdick
---
 examples/CMakeLists.txt                       |  10 +-
 .../python_integration_sample/CMakeLists.txt  |  68 ++++++
 .../example_matxutil.py                       |  77 ++++++
 .../python_integration_sample/matxutil.cu     | 231 ++++++++++++++++++
 .../python_integration_sample/mypythonlib.py  |  15 ++
 include/matx/core/make_tensor.h               | 133 ++++++++++
 include/matx/core/tensor.h                    |   3 +-
 test/00_tensor/BasicTensorTests.cu            |   2 +-
 8 files changed, 533 insertions(+), 6 deletions(-)
 create mode 100644 examples/python_integration_sample/CMakeLists.txt
 create mode 100644 examples/python_integration_sample/example_matxutil.py
 create mode 100644 examples/python_integration_sample/matxutil.cu
 create mode 100644 examples/python_integration_sample/mypythonlib.py

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index d56f099b..2397c509 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -20,15 +20,15 @@ set(examples
     print_styles)
 
-    
-    
+
+
 add_library(example_lib INTERFACE)
 target_include_directories(example_lib SYSTEM INTERFACE ${CUTLASS_INC} ${pybind11_INCLUDE_DIR} ${PYTHON_INCLUDE_DIRS})
 target_link_libraries(example_lib INTERFACE matx::matx) # Transitive properties
 set_property(TARGET example_lib PROPERTY ENABLE_EXPORTS 1)
-    
+
 if(eigen_DIR)
   include_directories(SYSTEM ${eigen_DIR})
   add_definitions(-DUSE_EIGEN)
@@ -64,3 +64,7 @@ endforeach()
 # Add host-compiler only example program to catch missing ifdef __CUDACC__ guards
 add_executable(test_host test_host.cpp)
 target_link_libraries(test_host matx::matx)
+
+if(MATX_EN_PYBIND11)
+  add_subdirectory(python_integration_sample)
+endif()
\ No newline at end of file
diff --git a/examples/python_integration_sample/CMakeLists.txt b/examples/python_integration_sample/CMakeLists.txt
new file mode 100644
index 00000000..44208892
--- /dev/null
+++ b/examples/python_integration_sample/CMakeLists.txt
@@ -0,0 +1,68 @@
+# This is a cmake project showing how to build a python importable library
+# using pybind11, how to pass tensors between MatX and python, and
+# how to call MatX operators from python
+
+cmake_minimum_required(VERSION 3.26)
+
+if(NOT DEFINED CMAKE_BUILD_TYPE)
+  message(WARNING "CMAKE_BUILD_TYPE not defined. Defaulting to release.")
+  set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type: Debug;Release;MinSizeRel;RelWithDebInfo")
+endif()
+
+if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+  message(WARNING "CMAKE_CUDA_ARCHITECTURES not defined. Defaulting to 70")
+  set(CMAKE_CUDA_ARCHITECTURES 70 CACHE STRING "Select compile target CUDA Compute Capabilities")
+endif()
+
+if(NOT DEFINED MATX_FETCH_REMOTE)
+  message(WARNING "MATX_FETCH_REMOTE not defined. Defaulting to OFF, will use local MatX repo")
+  set(MATX_FETCH_REMOTE OFF CACHE BOOL "Set MatX repo fetch location")
+endif()
+
+project(SAMPLE_MATX_PYTHON LANGUAGES CUDA CXX)
+find_package(CUDAToolkit 12.2 REQUIRED)
+
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+# Must enable pybind11 support
+set(MATX_EN_PYBIND11 ON)
+
+# Use this section if you want to configure other MatX options
+#set(MATX_EN_VISUALIZATION ON) # Uncomment to enable visualizations
+#set(MATX_EN_FILEIO ON) # Uncomment to enable file IO
+
+# Skip recursive MatX fetch
+if(MATX_BUILD_EXAMPLES)
+else()
+  if(MATX_FETCH_REMOTE)
+    include(FetchContent)
+    FetchContent_Declare(
+      MatX
+      GIT_REPOSITORY https://github.com/NVIDIA/MatX.git
+      GIT_TAG main
+    )
+  else()
+    include(FetchContent)
+    FetchContent_Declare(
+      MatX
+      SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../
+    )
+  endif()
+  FetchContent_MakeAvailable(MatX)
+endif()
+
+add_library(matxutil MODULE matxutil.cu)
+target_link_libraries(matxutil PRIVATE matx::matx CUDA::cudart)
+set_target_properties(matxutil PROPERTIES SUFFIX ".so" PREFIX "")
+
+configure_file(
+  ${CMAKE_CURRENT_SOURCE_DIR}/mypythonlib.py
+  ${CMAKE_BINARY_DIR}
+  COPYONLY
+)
+
+configure_file(
+  ${CMAKE_CURRENT_SOURCE_DIR}/example_matxutil.py
+  ${CMAKE_BINARY_DIR}
+  COPYONLY
+)
diff --git a/examples/python_integration_sample/example_matxutil.py b/examples/python_integration_sample/example_matxutil.py
new file mode 100644
index 00000000..259be1fb
--- /dev/null
+++ b/examples/python_integration_sample/example_matxutil.py
@@ -0,0 +1,77 @@
+import cupy as cp
+import sys
+
+# Add path . if we built as a stand-alone project
+sys.path.append('.')
+
+# Add path examples/python_integration_sample/ if we built as part of MatX examples
+sys.path.append('examples/python_integration_sample/')
+
+import matxutil
+
+# Demonstrate dlpack consumption invalidates it for future use
+def dlp_usage_error():
+    a = cp.empty((3,3), dtype=cp.float32)
+    dlp = a.toDlpack()
+    assert(matxutil.check_dlpack_status(dlp) == 0)
+    a2 = cp.from_dlpack(dlp) # consumes dlp; the capsule cannot be used again
+    assert(matxutil.check_dlpack_status(dlp) != 0)
+    return dlp
+
+# Demonstrate cupy array stays in scope when returning valid dlp
+def scope_okay():
+    a = cp.empty((3,3), dtype=cp.float32)
+    a[1,1] = 2
+    dlp = a.toDlpack()
+    assert(matxutil.check_dlpack_status(dlp) == 0)
+    return dlp
+
+# Do all cupy work using the "with stream" context manager
+stream = cp.cuda.stream.Stream(non_blocking=True)
+with stream:
+    print("Demonstrate dlpack consumption invalidates it for future use:")
+    dlp = dlp_usage_error()
+    assert(matxutil.check_dlpack_status(dlp) != 0)
+    print(f"  dlp capsule name is: {matxutil.get_capsule_name(dlp)}")
+    print()
+
+    print("Demonstrate cupy array stays in scope when returning valid dlpack:")
+    dlp = scope_okay()
+    assert(matxutil.check_dlpack_status(dlp) == 0)
+    print(f"  dlp capsule name is: {matxutil.get_capsule_name(dlp)}")
+    print()
+
+    print("Print info about the dlpack:")
+    matxutil.print_dlpack_info(dlp)
+    print()
+
+    print("Use MatX to print the tensor:")
+    matxutil.print_float_2D(dlp)
+    print()
+
+    print("Print current memory usage info:")
+    gpu_mempool = cp.get_default_memory_pool()
+    pinned_mempool = cp.get_default_pinned_memory_pool()
+    print(f"  GPU mempool used bytes {gpu_mempool.used_bytes()}")
+    print(f"  Pinned mempool n_free_blocks {pinned_mempool.n_free_blocks()}")
+    print()
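+    # Note: per the DLPack protocol, a consumer that takes ownership of a
+    # capsule renames it from "dltensor" to "used_dltensor"; that rename is
+    # what matxutil.check_dlpack_status() detects in the checks above and below.
+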
+    print("Demonstrate python to C++ to python to C++ calling chain (uses mypythonlib.py):")
+    # This function calls back into python and executes a from_dlpack, consuming the dlp
+    matxutil.call_python_example(dlp)
+    assert(matxutil.check_dlpack_status(dlp) != 0)
+    del dlp
+
+    print("Demonstrate adding two tensors together using MatX:")
+    a = cp.array([[1,2,3],[4,5,6],[7,8,9]], dtype=cp.float32)
+    b = cp.array([[1,2,3],[4,5,6],[7,8,9]], dtype=cp.float32)
+    c = cp.empty(b.shape, dtype=b.dtype)
+
+    c_dlp = c.toDlpack()
+    a_dlp = a.toDlpack()
+    b_dlp = b.toDlpack()
+    matxutil.add_float_2D(c_dlp, a_dlp, b_dlp, stream.ptr)
+    stream.synchronize()
+    print(f"Tensor a {a}")
+    print(f"Tensor b {b}")
+    print(f"Tensor c=a+b {c}")
diff --git a/examples/python_integration_sample/matxutil.cu b/examples/python_integration_sample/matxutil.cu
new file mode 100644
index 00000000..1e2b800b
--- /dev/null
+++ b/examples/python_integration_sample/matxutil.cu
@@ -0,0 +1,231 @@
+////////////////////////////////////////////////////////////////////////////////
+// BSD 3-Clause License
+//
+// Copyright (c) 2024, NVIDIA Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice, this
+//    list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+//    this list of conditions and the following disclaimer in the documentation
+//    and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its
+//    contributors may be used to endorse or promote products derived from
+//    this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include <matx.h>
+#include <pybind11/pybind11.h>
+#include <cuda_runtime.h>
+#include <cstdio>
+#include <cstring>
+
+namespace py = pybind11;
+
+const char* get_capsule_name(py::capsule capsule)
+{
+  return capsule.name();
+}
+
+typedef DLManagedTensor* PTR_DLManagedTensor;
+
+int attempt_unpack_dlpack(py::capsule dlpack_capsule, PTR_DLManagedTensor& p_dlpack)
+{
+  const char* capsule_name = dlpack_capsule.name();
+
+  if (strncmp(capsule_name,"dltensor",8) != 0)
+  {
+    fprintf(stderr,"capsule_name %s\n",capsule_name);
+    return -1;
+  }
+
+  p_dlpack = static_cast<PTR_DLManagedTensor>(dlpack_capsule.get_pointer());
+
+  if (p_dlpack == nullptr) {
+    fprintf(stderr,"p_dlpack == nullptr\n");
+    return -2;
+  }
+
+  return 0;
+}
+
+int check_dlpack_status(py::capsule dlpack_capsule)
+{
+  PTR_DLManagedTensor unused;
+  return attempt_unpack_dlpack(dlpack_capsule, unused);
+}
+
+const char* dlpack_device_type_to_string(DLDeviceType device_type)
+{
+  switch(device_type)
+  {
+    case kDLCPU: return "kDLCPU";
+    case kDLCUDA: return "kDLCUDA";
+    case kDLCUDAHost: return "kDLCUDAHost";
+    case kDLOpenCL: return "kDLOpenCL";
+    case kDLVulkan: return "kDLVulkan";
+    case kDLMetal: return "kDLMetal";
+    case kDLVPI: return "kDLVPI";
+    case kDLROCM: return "kDLROCM";
+    case kDLROCMHost: return "kDLROCMHost";
+    case kDLExtDev: return "kDLExtDev";
+    case kDLCUDAManaged: return "kDLCUDAManaged";
+    case kDLOneAPI: return "kDLOneAPI";
+    case kDLWebGPU: return "kDLWebGPU";
+    case kDLHexagon: return "kDLHexagon";
+    default: return "Unknown DLDeviceType";
+  }
+}
+
+const char* dlpack_code_to_string(uint8_t code)
+{
+  switch(code)
+  {
+    case kDLInt: return "kDLInt";
+    case kDLUInt: return "kDLUInt";
+    case kDLFloat: return "kDLFloat";
+    case kDLOpaqueHandle: return "kDLOpaqueHandle";
+    case kDLBfloat: return "kDLBfloat";
+    case kDLComplex: return "kDLComplex";
+    case kDLBool: return "kDLBool";
+    default: return "Unknown DLDataTypeCode";
+  }
+}
+
+void print_dlpack_info(py::capsule dlpack_capsule) {
+  PTR_DLManagedTensor p_tensor;
+  if (attempt_unpack_dlpack(dlpack_capsule, p_tensor))
+  {
+    fprintf(stderr,"Error: capsule not valid dlpack\n");
+    return;
+  }
+
+  printf("  data: %p\n",p_tensor->dl_tensor.data);
+  printf("  device: device_type %s, device_id %d\n",
+    dlpack_device_type_to_string(p_tensor->dl_tensor.device.device_type),
+    p_tensor->dl_tensor.device.device_id
+  );
+  printf("  ndim: %d\n",p_tensor->dl_tensor.ndim);
+  printf("  dtype: code %s, bits %u, lanes %u\n",
+    dlpack_code_to_string(p_tensor->dl_tensor.dtype.code),
+    p_tensor->dl_tensor.dtype.bits,
+    p_tensor->dl_tensor.dtype.lanes
+  );
+  printf("  shape: ");
+  for (int k=0; k<p_tensor->dl_tensor.ndim; k++)
+  {
+    printf("%ld, ",p_tensor->dl_tensor.shape[k]);
+  }
+  printf("\n");
+  printf("  strides: ");
+  for (int k=0; k<p_tensor->dl_tensor.ndim; k++)
+  {
+    printf("%ld, ",p_tensor->dl_tensor.strides[k]);
+  }
+  printf("\n");
+  printf("  byte_offset: %lu\n",p_tensor->dl_tensor.byte_offset);
+}
+
+template <typename T, int RANK>
+void print(py::capsule dlpack_capsule)
+{
+  PTR_DLManagedTensor p_tensor;
+  if (attempt_unpack_dlpack(dlpack_capsule, p_tensor))
+  {
+    fprintf(stderr,"Error: capsule not valid dlpack\n");
+    return;
+  }
+
+  matx::tensor_t<T, RANK> a;
+  matx::make_tensor(a, *p_tensor);
+  matx::print(a);
+}
+
+void call_python_example(py::capsule dlpack_capsule)
+{
+  PTR_DLManagedTensor p_tensor;
+  if (attempt_unpack_dlpack(dlpack_capsule, p_tensor))
+  {
+    fprintf(stderr,"Error: capsule not valid dlpack\n");
+    return;
+  }
+
+  matx::tensor_t<float, 2> a;
+  matx::make_tensor(a, *p_tensor);
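+
+  // make_tensor() wraps the existing DLPack allocation rather than copying it,
+  // so 'a' aliases the CuPy array's device memory; the Python side must keep
+  // that allocation alive for as long as the view is used.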
+
+  auto pb = matx::detail::MatXPybind{};
+
+  // Example use of python's print
+  pybind11::print("  Example use of python's print function from C++: ", 1, 2.0, "three");
+  pybind11::print("  The dlpack_capsule is a ", dlpack_capsule);
+
+  auto mypythonlib = pybind11::module_::import("mypythonlib");
+  mypythonlib.attr("my_func")(dlpack_capsule);
+}
+
+template <typename T, int RANK>
+void add(py::capsule capsule_c, py::capsule capsule_a, py::capsule capsule_b, int64_t stream = 0)
+{
+  PTR_DLManagedTensor p_tensor_c;
+  PTR_DLManagedTensor p_tensor_a;
+  PTR_DLManagedTensor p_tensor_b;
+
+  // TODO: these should throw a MatX error instead of printing to stderr
+  if (attempt_unpack_dlpack(capsule_c, p_tensor_c))
+  {
+    fprintf(stderr,"Error: capsule c not valid dlpack\n");
+    return;
+  }
+
+  if (attempt_unpack_dlpack(capsule_a, p_tensor_a))
+  {
+    fprintf(stderr,"Error: capsule a not valid dlpack\n");
+    return;
+  }
+
+  if (attempt_unpack_dlpack(capsule_b, p_tensor_b))
+  {
+    fprintf(stderr,"Error: capsule b not valid dlpack\n");
+    return;
+  }
+
+  matx::tensor_t<T, RANK> c;
+  matx::tensor_t<T, RANK> a;
+  matx::tensor_t<T, RANK> b;
+  matx::make_tensor(c, *p_tensor_c);
+  matx::make_tensor(a, *p_tensor_a);
+  matx::make_tensor(b, *p_tensor_b);
+
+  matx::cudaExecutor exec{reinterpret_cast<cudaStream_t>(stream)};
+  (c = a + b).run(exec);
+}
+
+PYBIND11_MODULE(matxutil, m) {
+  m.def("get_capsule_name", &get_capsule_name, "Returns PyCapsule name");
+  m.def("print_dlpack_info", &print_dlpack_info, "Print the DLPack tensor metadata");
+  m.def("check_dlpack_status", &check_dlpack_status, "Returns 0 if DLPack is valid, negative error code otherwise");
+  m.def("print_float_2D", &print<float, 2>, "Prints a float32 2D tensor");
+  m.def("call_python_example", &call_python_example, "Example C++ function that calls python code");
+  m.def("add_float_2D",
+    &add<float, 2>,
+    "Add two float32 2D tensors together",
+    py::arg("c"),
+    py::arg("a"),
+    py::arg("b"),
+    py::arg("stream") = 0);
+}
\ No newline at end of file
diff --git a/examples/python_integration_sample/mypythonlib.py b/examples/python_integration_sample/mypythonlib.py
new file mode 100644
index 00000000..a419e8de
--- /dev/null
+++ b/examples/python_integration_sample/mypythonlib.py
@@ -0,0 +1,15 @@
+import cupy as cp
+import sys
+sys.path.append('.')
+import matxutil
+
+def my_func(dlp):
+    print(f"  type(dlp) before cp.from_dlpack(): {type(dlp)}")
+    print(f"  dlp capsule name is: {matxutil.get_capsule_name(dlp)}")
+    a = cp.from_dlpack(dlp)
+    print(f"  type(dlp) after cp.from_dlpack(): {type(dlp)}")
+    print(f"  dlp capsule name is: {matxutil.get_capsule_name(dlp)}")
+    print(f"  type(cp.from_dlpack(dlp)): {type(a)}")
+    print()
+    print("Finally, print the tensor we received from MatX using python:")
+    print(a)
diff --git a/include/matx/core/make_tensor.h b/include/matx/core/make_tensor.h
index cbd83482..eda351f7 100644
--- a/include/matx/core/make_tensor.h
+++ b/include/matx/core/make_tensor.h
@@ -36,6 +36,7 @@
 #include "matx/core/nvtx.h"
 #include "matx/core/storage.h"
 #include "matx/core/tensor_desc.h"
+#include "matx/core/dlpack.h"
 
 namespace matx {
 /**
@@ -619,4 +620,136 @@ auto make_static_tensor() {
   return tensor_t{std::move(s), std::move(desc)};
 }
 
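+/**
+ * Create a tensor view from a DLPack managed tensor
+ *
+ * The tensor is created as a non-owning view of the DLPack tensor's data, so
+ * the underlying allocation must outlive the returned view.
+ *
+ * @param tensor
+ *   Tensor object to store the newly-created tensor into
+ * @param dlp_tensor
+ *   DLPack managed tensor describing the existing allocation
+ **/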
+template <typename TensorType,
+          std::enable_if_t<is_tensor_view_v<TensorType>, bool> = true>
+auto make_tensor( TensorType &tensor,
+                  const DLManagedTensor dlp_tensor) {
+  MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)
+
+  using T = typename TensorType::value_type;
+  const DLTensor &dt = dlp_tensor.dl_tensor;
+
+  // MatX doesn't track the memory type or device ID, so we don't need to copy it
+  MATX_ASSERT_STR_EXP(dt.ndim, TensorType::Rank(), matxInvalidDim, "DLPack rank doesn't match MatX rank!");
+
+  switch (dt.dtype.code) {
+    case kDLComplex: {
+      switch (dt.dtype.bits) {
+        case 128: {
+          [[maybe_unused]] constexpr bool same = std::is_same_v<T, cuda::std::complex<double>>;
+          MATX_ASSERT_STR(same, matxInvalidType, "DLPack/MatX type mismatch");
+          break;
+        }
+        case 64: {
+          [[maybe_unused]] constexpr bool same = std::is_same_v<T, cuda::std::complex<float>>;
+          MATX_ASSERT_STR(same, matxInvalidType, "DLPack/MatX type mismatch");
+          break;
+        }
+        case 32: {
+          [[maybe_unused]] constexpr bool same = std::is_same_v<T, matxFp16Complex> || std::is_same_v<T, matxBf16Complex>;
+          MATX_ASSERT_STR(same, matxInvalidType, "DLPack/MatX type mismatch");
+          break;
+        }
+        default:
+          MATX_THROW(matxInvalidSize, "Invalid complex float size from DLPack");
+      }
+      break;
+    }
+    case kDLFloat: {
+      switch (dt.dtype.bits) {
+        case 64: {
+          [[maybe_unused]] constexpr bool same = std::is_same_v<T, double>;
+          MATX_ASSERT_STR(same, matxInvalidType, "DLPack/MatX type mismatch");
+          break;
+        }
+        case 32: {
+          [[maybe_unused]] constexpr bool same = std::is_same_v<T, float>;
+          MATX_ASSERT_STR(same, matxInvalidType, "DLPack/MatX type mismatch");
+          break;
+        }
+        case 16: {
+          [[maybe_unused]] constexpr bool same = std::is_same_v<T, matxFp16> || std::is_same_v<T, matxBf16>;
+          MATX_ASSERT_STR(same, matxInvalidType, "DLPack/MatX type mismatch");
+          break;
+        }
+        default:
+          MATX_THROW(matxInvalidSize, "Invalid float size from DLPack");
+      }
+      break;
+    }
+    case kDLInt: {
+      switch (dt.dtype.bits) {
+        case 64: {
+          [[maybe_unused]] constexpr bool same = std::is_same_v<T, int64_t>;
+          MATX_ASSERT_STR(same, matxInvalidType, "DLPack/MatX type mismatch");
+          break;
+        }
+        case 32: {
+          [[maybe_unused]] constexpr bool same = std::is_same_v<T, int32_t>;
+          MATX_ASSERT_STR(same, matxInvalidType, "DLPack/MatX type mismatch");
+          break;
+        }
+        case 16: {
+          [[maybe_unused]] constexpr bool same = std::is_same_v<T, int16_t>;
+          MATX_ASSERT_STR(same, matxInvalidType, "DLPack/MatX type mismatch");
+          break;
+        }
+        case 8: {
+          [[maybe_unused]] constexpr bool same = std::is_same_v<T, int8_t>;
+          MATX_ASSERT_STR(same, matxInvalidType, "DLPack/MatX type mismatch");
+          break;
+        }
+        default:
+          MATX_THROW(matxInvalidSize, "Invalid signed integer size from DLPack");
+      }
+      break;
+    }
+    case kDLUInt: {
+      switch (dt.dtype.bits) {
+        case 64: {
+          [[maybe_unused]] constexpr bool same = std::is_same_v<T, uint64_t>;
+          MATX_ASSERT_STR(same, matxInvalidType, "DLPack/MatX type mismatch");
+          break;
+        }
+        case 32: {
+          [[maybe_unused]] constexpr bool same = std::is_same_v<T, uint32_t>;
+          MATX_ASSERT_STR(same, matxInvalidType, "DLPack/MatX type mismatch");
+          break;
+        }
+        case 16: {
+          [[maybe_unused]] constexpr bool same = std::is_same_v<T, uint16_t>;
+          MATX_ASSERT_STR(same, matxInvalidType, "DLPack/MatX type mismatch");
+          break;
+        }
+        case 8: {
+          [[maybe_unused]] constexpr bool same = std::is_same_v<T, uint8_t>;
+          MATX_ASSERT_STR(same, matxInvalidType, "DLPack/MatX type mismatch");
+          break;
+        }
+        default:
+          MATX_THROW(matxInvalidSize, "Invalid unsigned integer size from DLPack");
+      }
+      break;
+    }
+    case kDLBool: {
+      [[maybe_unused]] constexpr bool same = std::is_same_v<T, bool>;
+      MATX_ASSERT_STR(same, matxInvalidType, "DLPack/MatX type mismatch");
+      break;
+    }
+    default:
+      // No corresponding MatX type for the remaining DLPack dtype codes
+      MATX_THROW(matxInvalidType, "Unsupported DLPack data type code");
+  }
+
+  index_t strides[TensorType::Rank()];
+  index_t shape[TensorType::Rank()];
+
+  for (int r = 0; r < TensorType::Rank(); r++) {
+    strides[r] = dt.strides[r];
+    shape[r] = dt.shape[r];
+  }
+
+  auto tmp = make_tensor(reinterpret_cast<T*>(dt.data), shape, strides, false);
+  tensor.Shallow(tmp);
+}
+
 } // namespace matx
diff --git a/include/matx/core/tensor.h b/include/matx/core/tensor.h
index 82b7b62b..cf4d5f22 100644
--- a/include/matx/core/tensor.h
+++ b/include/matx/core/tensor.h
@@ -1441,7 +1441,7 @@ class tensor_t : public detail::tensor_impl_t<T, RANK> {
    *
    * @returns Pointer to a new DLManagedTensor. The caller must call the
    * deleter function when finished.
    */
-  DLManagedTensor *GetDLPackTensor() const {
+  DLManagedTensor *ToDlPack() const {
     auto mt = new DLManagedTensor;
     DLTensor *t = &mt->dl_tensor;
     CUpointer_attribute attr[] = {CU_POINTER_ATTRIBUTE_MEMORY_TYPE, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL};
@@ -1509,7 +1509,6 @@ class tensor_t : public detail::tensor_impl_t<T, RANK> {
     return mt;
   }
 
-
 private:
   Storage storage_;
 };
diff --git a/test/00_tensor/BasicTensorTests.cu b/test/00_tensor/BasicTensorTests.cu
index 759b3adb..85a5225b 100644
--- a/test/00_tensor/BasicTensorTests.cu
+++ b/test/00_tensor/BasicTensorTests.cu
@@ -541,7 +541,7 @@ TYPED_TEST(BasicTensorTestsAll, DLPack)
   using TestType = cuda::std::tuple_element_t<0, TypeParam>;
 
   auto t = make_tensor<TestType>({5,10,20});
-  auto dl = t.GetDLPackTensor();
+  auto dl = t.ToDlPack();
 
   ASSERT_EQ(dl->dl_tensor.ndim, 3);
   ASSERT_EQ(dl->dl_tensor.data, t.Data());