From 8e95993595d4c32037023e37d0ddfa9fc05f04bf Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Wed, 14 Jun 2023 05:25:46 -0700 Subject: [PATCH] Add Arrow - Velox conversion support (#4450) Summary: This PR introduces PyVelox functions for the conversion of Arrow Arrays to Velox Vectors and vice-versa. Pull Request resolved: https://github.com/facebookincubator/velox/pull/4450 Reviewed By: bikramSingh91 Differential Revision: D46570398 Pulled By: kgpai fbshipit-source-id: cf0557ad26a568f10866683e59cfa2bd79040579 --- .github/workflows/build_pyvelox.yml | 1 + Makefile | 5 +-- pyvelox/CMakeLists.txt | 3 +- pyvelox/conversion.cpp | 52 +++++++++++++++++++++++++++++ pyvelox/conversion.h | 34 +++++++++++++++++++ pyvelox/pyvelox.cpp | 2 ++ pyvelox/test/test_vector.py | 52 +++++++++++++++++++++++++++-- setup.py | 2 ++ 8 files changed, 146 insertions(+), 5 deletions(-) create mode 100644 pyvelox/conversion.cpp create mode 100644 pyvelox/conversion.h diff --git a/.github/workflows/build_pyvelox.yml b/.github/workflows/build_pyvelox.yml index 362b289d63fa..bb01d8e804ce 100644 --- a/.github/workflows/build_pyvelox.yml +++ b/.github/workflows/build_pyvelox.yml @@ -127,6 +127,7 @@ jobs: cp -R /host${{ github.workspace }}/.ccache /output/.ccache && ccache -s CIBW_ENVIRONMENT_PASS_LINUX: CCACHE_DIR BUILD_VERSION + CIBW_TEST_EXTRAS: "tests" CIBW_TEST_COMMAND: "cd {project}/pyvelox && python -m unittest -v" CIBW_TEST_SKIP: "*macos*" CCACHE_DIR: "${{ matrix.os != 'macos-11' && '/output' || github.workspace }}/.ccache" diff --git a/Makefile b/Makefile index f161e93a7f40..294b3a4d0650 100644 --- a/Makefile +++ b/Makefile @@ -159,7 +159,8 @@ python-clean: DEBUG=1 ${PYTHON_EXECUTABLE} setup.py clean python-build: - DEBUG=1 CMAKE_BUILD_PARALLEL_LEVEL=4 ${PYTHON_EXECUTABLE} setup.py develop + DEBUG=1 CMAKE_BUILD_PARALLEL_LEVEL=4 ${PYTHON_EXECUTABLE} -m pip install -e .$(extras) --verbose -python-test: python-build +python-test: + $(MAKE) python-build extras="[tests]" DEBUG=1 ${PYTHON_EXECUTABLE} -m unittest -v diff --git a/pyvelox/CMakeLists.txt b/pyvelox/CMakeLists.txt index 3b40a43176c2..92d4adbef213 100644 --- a/pyvelox/CMakeLists.txt +++ b/pyvelox/CMakeLists.txt @@ -17,7 +17,8 @@ if(VELOX_BUILD_PYTHON_PACKAGE) include_directories(SYSTEM ${CMAKE_SOURCE_DIR}) add_definitions(-DCREATE_PYVELOX_MODULE -DVELOX_DISABLE_GOOGLETEST) # Define our Python module: - pybind11_add_module(pyvelox MODULE pyvelox.cpp serde.cpp signatures.cpp) + pybind11_add_module(pyvelox MODULE pyvelox.cpp serde.cpp signatures.cpp + conversion.cpp) # Link with Velox: target_link_libraries( pyvelox diff --git a/pyvelox/conversion.cpp b/pyvelox/conversion.cpp new file mode 100644 index 000000000000..d5766865a682 --- /dev/null +++ b/pyvelox/conversion.cpp @@ -0,0 +1,52 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "conversion.h" +#include +#include +#include "context.h" + +namespace facebook::velox::py { + +namespace py = pybind11; + +void addConversionBindings(py::module& m, bool asModuleLocalDefinitions) { + m.def("export_to_arrow", [](VectorPtr& inputVector) { + auto arrowArray = std::make_unique(); + auto pool_ = PyVeloxContext::getSingletonInstance().pool(); + facebook::velox::exportToArrow(inputVector, *arrowArray, pool_); + + auto arrowSchema = std::make_unique(); + facebook::velox::exportToArrow(inputVector, *arrowSchema); + + py::module arrow_module = py::module::import("pyarrow"); + py::object array_class = arrow_module.attr("Array"); + return array_class.attr("_import_from_c")( + reinterpret_cast(arrowArray.get()), + reinterpret_cast(arrowSchema.get())); + }); + + m.def("import_from_arrow", [](py::object inputArrowArray) { + auto arrowArray = std::make_unique(); + auto arrowSchema = std::make_unique(); + inputArrowArray.attr("_export_to_c")( + reinterpret_cast(arrowArray.get()), + reinterpret_cast(arrowSchema.get())); + auto pool_ = PyVeloxContext::getSingletonInstance().pool(); + return importFromArrowAsOwner(*arrowSchema, *arrowArray, pool_); + }); +} +} // namespace facebook::velox::py diff --git a/pyvelox/conversion.h b/pyvelox/conversion.h new file mode 100644 index 000000000000..0b0e2d394bc1 --- /dev/null +++ b/pyvelox/conversion.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace facebook::velox::py { + +namespace py = pybind11; + +/// Adds bindings for arrow-velox conversion functions to module m. +/// +/// @param m Module to add bindings to. +/// @param asModuleLocalDefinitions If true then these bindings are only +/// visible inside the module. Refer to +/// https://pybind11.readthedocs.io/en/stable/advanced/classes.html#module-local-class-bindings +/// for further details. +void addConversionBindings(py::module& m, bool asModuleLocalDefinitions = true); + +} // namespace facebook::velox::py diff --git a/pyvelox/pyvelox.cpp b/pyvelox/pyvelox.cpp index 838bd7557049..495cff70e7ff 100644 --- a/pyvelox/pyvelox.cpp +++ b/pyvelox/pyvelox.cpp @@ -15,6 +15,7 @@ */ #include "pyvelox.h" +#include "conversion.h" #include "serde.h" #include "signatures.h" @@ -294,6 +295,7 @@ PYBIND11_MODULE(pyvelox, m) { addVeloxBindings(m); addSignatureBindings(m); addSerdeBindings(m); + addConversionBindings(m); m.attr("__version__") = "dev"; } #endif diff --git a/pyvelox/test/test_vector.py b/pyvelox/test/test_vector.py index 74a352aafcc6..c2a1d271844f 100644 --- a/pyvelox/test/test_vector.py +++ b/pyvelox/test/test_vector.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import unittest - +import pyarrow as pa import pyvelox.pyvelox as pv +import unittest class TestVeloxVector(unittest.TestCase): @@ -273,3 +273,51 @@ def test_slice(self): with self.assertRaises(NotImplementedError): e = a[3:8:3] + + def test_export_to_arrow(self): + test_cases = [ + ([1, 2, 3], pa.int64()), + ([1.1, 2.2, 3.3], pa.float64()), + (["ab", "bc", "ca"], pa.string()), + ] + for data, expected_type in test_cases: + with self.subTest(data=data): + vector = pv.from_list(data) + array = pv.export_to_arrow(vector) + + self.assertEqual(array.type, expected_type) + self.assertEqual(len(array), len(data)) + self.assertListEqual(array.tolist(), data) + + def test_import_from_arrow(self): + test_cases = [ + ([11, 26, 31], pa.int64(), pv.IntegerType()), + ([0.1, 2.5, 3.9], pa.float64(), pv.DoubleType()), + (["az", "by", "cx"], pa.string(), pv.VarcharType()), + ] + for data, dtype, expected_type in test_cases: + with self.subTest(data=data): + array = pa.array(data, type=dtype) + velox_vector = pv.import_from_arrow(array) + + self.assertEqual(velox_vector.size(), len(data)) + self.assertTrue(velox_vector.dtype(), expected_type) + for i in range(0, len(data)): + self.assertEqual(velox_vector[i], data[i]) + + def test_roundtrip_conversion(self): + test_cases = [ + ([41, 92, 13], pv.IntegerType()), + ([17.19, 22.25, 13.3], pv.DoubleType()), + (["aa1", "bb2", "cc3"], pv.VarcharType()), + ] + for data, expected_type in test_cases: + with self.subTest(data=data): + vector = pv.from_list(data) + array = pv.export_to_arrow(vector) + + velox_vector = pv.import_from_arrow(array) + self.assertEqual(velox_vector.size(), len(data)) + self.assertTrue(velox_vector.dtype(), expected_type) + for i in range(0, len(data)): + self.assertEqual(velox_vector[i], data[i]) diff --git a/setup.py b/setup.py index 7f18d8e1f1db..8ec26c7d3aab 100644 --- a/setup.py +++ b/setup.py @@ -172,7 +172,9 @@ def build_extension(self, ext): "typing", "tabulate", "typing-inspect", + "pyarrow", ], + extras_require={"tests": ["pyarrow"]}, python_requires=">=3.7", classifiers=[ "Intended Audience :: Developers",