From 569e48c6231100571265800b00f44b3a7e9d80b7 Mon Sep 17 00:00:00 2001
From: Tim Paine <3105306+timkpaine@users.noreply.github.com>
Date: Fri, 23 Feb 2024 12:02:53 -0500
Subject: [PATCH 1/4] WIP move to pycapsule

---
 CMakeLists.txt                                     |   46 +-
 README.md                                          |    2 +-
 arrow_python_nocopy/__init__.py                    |   26 +-
 cmake/modules/FindArrow.cmake                      |   37 -
 cmake/modules/FindThrift.cmake                     |   40 -
 pyproject.toml                                     |    7 +-
 setup.py                                           |   25 +-
 src/apn-python/caster.hpp                          |  130 +-
 src/apn-python/common.cpp                          |   54 -
 src/apn-python/common.hpp                          |   14 -
 src/apn-python/cpython.cpp                         |  101 +-
 src/apn-python/cpython.hpp                         |   20 +-
 src/apn-python/pybind11.cpp                        |    2 +-
 src/apn-python/pybind11.hpp                        |   37 +-
 .../arrow/python/CMakeLists.txt                    |   18 -
 .../apache-arrow-12.0.1/arrow/python/api.h         |   30 -
 .../arrow/python/arrow_to_pandas.cc                | 2575 -----------------
 .../arrow/python/arrow_to_pandas.h                 |  146 -
 .../arrow/python/arrow_to_python_internal.h        |   49 -
 .../arrow/python/benchmark.cc                      |   38 -
 .../arrow/python/benchmark.h                       |   36 -
 .../arrow/python/common.cc                         |  203 --
 .../apache-arrow-12.0.1/arrow/python/common.h      |  366 ---
 .../apache-arrow-12.0.1/arrow/python/csv.cc        |   62 -
 .../apache-arrow-12.0.1/arrow/python/csv.h         |   42 -
 .../arrow/python/datetime.cc                       |  663 -----
 .../arrow/python/datetime.h                        |  231 --
 .../arrow/python/decimal.cc                        |  246 --
 .../arrow/python/decimal.h                         |  128 -
 .../arrow/python/deserialize.cc                    |  495 ----
 .../arrow/python/deserialize.h                     |  106 -
 .../arrow/python/extension_type.cc                 |  217 --
 .../arrow/python/extension_type.h                  |   85 -
 .../arrow/python/filesystem.cc                     |  206 --
 .../arrow/python/filesystem.h                      |  126 -
 .../arrow/python/flight.cc                         |  388 ---
 .../apache-arrow-12.0.1/arrow/python/flight.h      |  350 ---
 .../apache-arrow-12.0.1/arrow/python/gdb.cc        |  530 ----
 .../apache-arrow-12.0.1/arrow/python/gdb.h         |   29 -
 .../arrow/python/helpers.cc                        |  470 ---
 .../arrow/python/helpers.h                         |  159 -
 .../arrow/python/inference.cc                      |  748 -----
 .../arrow/python/inference.h                       |   64 -
 .../apache-arrow-12.0.1/arrow/python/init.cc       |   24 -
 .../apache-arrow-12.0.1/arrow/python/init.h        |   26 -
 .../apache-arrow-12.0.1/arrow/python/io.cc         |  384 ---
 .../apache-arrow-12.0.1/arrow/python/io.h          |  121 -
 .../apache-arrow-12.0.1/arrow/python/ipc.cc        |   67 -
 .../apache-arrow-12.0.1/arrow/python/ipc.h         |   52 -
 .../arrow/python/iterators.h                       |  194 --
 .../apache-arrow-12.0.1/arrow/python/lib.h         |   63 -
 .../arrow/python/lib_api.h                         |  222 --
 .../arrow/python/numpy_convert.cc                  |  562 ----
 .../arrow/python/numpy_convert.h                   |  120 -
 .../arrow/python/numpy_internal.h                  |  182 --
 .../arrow/python/numpy_interop.h                   |   96 -
 .../arrow/python/numpy_to_arrow.cc                 |  870 ------
 .../arrow/python/numpy_to_arrow.h                  |   72 -
 .../arrow/python/parquet_encryption.cc             |   98 -
 .../arrow/python/parquet_encryption.h              |  109 -
 .../apache-arrow-12.0.1/arrow/python/pch.h         |   24 -
 .../arrow/python/platform.h                        |   41 -
 .../arrow/python/pyarrow.cc                        |   94 -
 .../arrow/python/pyarrow.h                         |   84 -
 .../arrow/python/pyarrow_api.h                     |   19 -
 .../arrow/python/pyarrow_lib.h                     |   19 -
 .../arrow/python/python_test.cc                    |  888 ------
 .../arrow/python/python_test.h                     |   42 -
 .../arrow/python/python_to_arrow.cc                | 1240 --------
 .../arrow/python/python_to_arrow.h                 |   80 -
 .../arrow/python/serialize.cc                      |  798 -----
 .../arrow/python/serialize.h                       |  145 -
 .../arrow/python/type_traits.h                     |  350 ---
 .../apache-arrow-12.0.1/arrow/python/udf.cc        |  736 -----
 .../apache-arrow-12.0.1/arrow/python/udf.h         |   76 -
 .../arrow/python/visibility.h                      |   39 -
 vcpkg.json                                         |    3 +-
 77 files changed, 213 insertions(+), 17074 deletions(-)
 delete mode 100644 cmake/modules/FindArrow.cmake
 delete mode 100644 cmake/modules/FindThrift.cmake
 delete mode 100644 src/apn-python/common.cpp
 delete mode 100644 src/apn-python/common.hpp
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/CMakeLists.txt
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/api.h
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/arrow_to_pandas.cc
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/arrow_to_pandas.h
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/arrow_to_python_internal.h
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/benchmark.cc
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/benchmark.h
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/common.cc
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/common.h
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/csv.cc
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/csv.h
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/datetime.cc
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/datetime.h
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/decimal.cc
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/decimal.h
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/deserialize.cc
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/deserialize.h
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/extension_type.cc
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/extension_type.h
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/filesystem.cc
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/filesystem.h
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/flight.cc
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/flight.h
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/gdb.cc
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/gdb.h
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/helpers.cc
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/helpers.h
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/inference.cc
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/inference.h
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/init.cc
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/init.h
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/io.cc
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/io.h
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/ipc.cc
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/ipc.h
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/iterators.h
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/lib.h
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/lib_api.h
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/numpy_convert.cc
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/numpy_convert.h
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/numpy_internal.h
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/numpy_interop.h
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/numpy_to_arrow.cc
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/numpy_to_arrow.h
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/parquet_encryption.cc
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/parquet_encryption.h
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/pch.h
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/platform.h
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/pyarrow.cc
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/pyarrow.h
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/pyarrow_api.h
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/pyarrow_lib.h
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/python_test.cc
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/python_test.h
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/python_to_arrow.cc
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/python_to_arrow.h
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/serialize.cc
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/serialize.h
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/type_traits.h
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/udf.cc
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/udf.h
 delete mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/visibility.h
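
Note on the CMakeLists.txt diff that follows: switching to `find_package(Arrow CONFIG REQUIRED)` means Arrow's own package configuration (for example the one vcpkg installs) resolves the dependency, which is why the hand-rolled FindArrow.cmake and FindThrift.cmake modules are deleted above. A minimal sketch of the config-package idiom, hedged: the patch itself keeps the `${Arrow_INCLUDE_DIR}` variable style, and the `Arrow::arrow_shared`/`Arrow::arrow_static` imported-target names assume a reasonably recent ArrowConfig.cmake.

    # Sketch only, not part of this patch
    find_package(Arrow CONFIG REQUIRED)
    # Imported targets carry include paths and transitive deps (e.g. Thrift),
    # so no include_directories() or custom Find modules are needed:
    target_link_libraries(arrow-python-nocopy PRIVATE
        "$<IF:$<TARGET_EXISTS:Arrow::arrow_shared>,Arrow::arrow_shared,Arrow::arrow_static>")
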
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7045a82..23ef99b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -175,7 +175,7 @@ message("${Red}${CMAKE_MODULE_PATH}${ColorReset}")
 # broken on mac
-find_package(Arrow REQUIRED)
+find_package(Arrow CONFIG REQUIRED)
 include_directories(${Arrow_INCLUDE_DIR})
 ###############################################################################################################
@@ -257,43 +257,12 @@ if(BUILD_PYTHON)
     # Find PyArrow. We will link against it for the build only
     # find_package(Pyarrow REQUIRED)
     # include_directories(${PYARROW_INCLUDE_DIR})
-
+
     # Find PyBind11
-    find_package(pybind11 REQUIRED)
+    find_package(pybind11 CONFIG REQUIRED)
     include_directories(${pybind11_INCLUDE_DIR})
-    set(VENDORED_PYARROW_ROOT "${PROJECT_SOURCE_DIR}/src/vendored/apache-arrow-12.0.1")
-    include_directories(${VENDORED_PYARROW_ROOT})
-    set(VENDORED_PYARROW_SRCS
-        # ${VENDORED_PYARROW_ROOT}/arrow/python/arrow_to_pandas.cc
-        # ${VENDORED_PYARROW_ROOT}/arrow/python/benchmark.cc
-        ${VENDORED_PYARROW_ROOT}/arrow/python/common.cc
-        # ${VENDORED_PYARROW_ROOT}/arrow/python/csv.cc
-        ${VENDORED_PYARROW_ROOT}/arrow/python/datetime.cc
-        ${VENDORED_PYARROW_ROOT}/arrow/python/decimal.cc
-        ${VENDORED_PYARROW_ROOT}/arrow/python/deserialize.cc
-        ${VENDORED_PYARROW_ROOT}/arrow/python/extension_type.cc
-        # ${VENDORED_PYARROW_ROOT}/arrow/python/filesystem.cc
-        # ${VENDORED_PYARROW_ROOT}/arrow/python/flight.cc
-        # ${VENDORED_PYARROW_ROOT}/arrow/python/gdb.cc
-        ${VENDORED_PYARROW_ROOT}/arrow/python/helpers.cc
-        ${VENDORED_PYARROW_ROOT}/arrow/python/inference.cc
-        ${VENDORED_PYARROW_ROOT}/arrow/python/init.cc
-        ${VENDORED_PYARROW_ROOT}/arrow/python/io.cc
-        ${VENDORED_PYARROW_ROOT}/arrow/python/ipc.cc
-        ${VENDORED_PYARROW_ROOT}/arrow/python/numpy_convert.cc
-        ${VENDORED_PYARROW_ROOT}/arrow/python/numpy_to_arrow.cc
-        # ${VENDORED_PYARROW_ROOT}/arrow/python/parquet_encryption.cc
-        ${VENDORED_PYARROW_ROOT}/arrow/python/pyarrow.cc
-        # ${VENDORED_PYARROW_ROOT}/arrow/python/python_test.cc
-        ${VENDORED_PYARROW_ROOT}/arrow/python/python_to_arrow.cc
-        ${VENDORED_PYARROW_ROOT}/arrow/python/serialize.cc
-        # ${VENDORED_PYARROW_ROOT}/arrow/python/udf.cc
-    )
-
-    # common functionality
-    add_library(common SHARED "${PROJECT_SOURCE_DIR}/src/apn-python/common.cpp" ${VENDORED_PYARROW_SRCS})
-    set_target_properties(common PROPERTIES PUBLIC_HEADER "${PROJECT_SOURCE_DIR}/src/apn-python/common.hpp")
+    # common functionality
     # pybind11 extension
     pybind11_add_module(pybind11extension MODULE "${PROJECT_SOURCE_DIR}/src/apn-python/pybind11.cpp")
     set_target_properties(pybind11extension PROPERTIES PUBLIC_HEADER "${PROJECT_SOURCE_DIR}/src/apn-python/pybind11.hpp")
@@ -303,16 +272,13 @@ if(BUILD_PYTHON)
     set_target_properties(cpythonextension PROPERTIES PUBLIC_HEADER "${PROJECT_SOURCE_DIR}/src/apn-python/cpython.hpp")
     # Link to standalone/common library
-    target_link_libraries(common PRIVATE arrow-python-nocopy)
-    target_link_libraries(pybind11extension PRIVATE common)
-    target_link_libraries(cpythonextension PRIVATE common)
-    set_property(TARGET common PROPERTY INSTALL_RPATH "${module_origin_path}")
+    target_link_libraries(pybind11extension PRIVATE arrow-python-nocopy)
+    target_link_libraries(cpythonextension PRIVATE arrow-python-nocopy)
     set_property(TARGET pybind11extension PROPERTY INSTALL_RPATH "${module_origin_path}")
     set_property(TARGET cpythonextension PROPERTY INSTALL_RPATH "${module_origin_path}")
     set_property(TARGET cpythonextension PROPERTY PREFIX "")
     # install in python module
-    install(TARGETS common EXPORT ArrowPythonNocopy LIBRARY DESTINATION lib PUBLIC_HEADER DESTINATION include/python)
     install(TARGETS pybind11extension EXPORT ArrowPythonNocopy LIBRARY DESTINATION lib PUBLIC_HEADER DESTINATION include/python)
     install(TARGETS cpythonextension EXPORT ArrowPythonNocopy LIBRARY DESTINATION lib PUBLIC_HEADER DESTINATION include/python)
 endif()
diff --git a/README.md b/README.md
index 6d77908..dc360bb 100644
--- a/README.md
+++ b/README.md
@@ -2,4 +2,4 @@
 This is a small example of building a C++ library which reads and returns arrays and schemas linked against `libarrow` (but **DOES NOT REQUIRE** `pyarrow`), and building a python extension that uses this library to pass arrays and schemas back and forth.
 ## tl;dr
-This utilizes Apache Arrow's **ABI-stable C interface**, which would, for example, allow your combined C++/python extension to compile and link against one version of arrow and be utilized with `pyarrow` which would otherwise be ABI-incompatible.
+This utilizes Apache Arrow's **ABI-stable C interface** and **PyCapsule interface**, which would, for example, allow your combined C++/python extension to compile and link against one version of arrow and be utilized with `pyarrow` which would otherwise be ABI-incompatible.
diff --git a/arrow_python_nocopy/__init__.py b/arrow_python_nocopy/__init__.py
index 55da63b..472a0d7 100644
--- a/arrow_python_nocopy/__init__.py
+++ b/arrow_python_nocopy/__init__.py
@@ -2,31 +2,18 @@
 import os.path
 import pyarrow as pa
 import pandas as pd
-from .lib.pybind11extension import array_info, create_array
-from .lib.pybind11extension import schema_info, create_schema
-
+# from .lib.pybind11extension import array_info, create_array
+# from .lib.pybind11extension import schema_info, create_schema
 # from .lib.pybind11extension import table_info, create_table
-from .lib.cpythonextension import array_info as array_info_cp, create_array as create_array_cp
-from .lib.cpythonextension import schema_info as schema_info_cp, create_schema as create_schema_cp
+from .lib.cpythonextension import array_info, create_array
+from .lib.cpythonextension import schema_info, create_schema
 # from .lib.cpythonextension import table_info as table_info_cp, create_table as create_table_cp
 __version__ = "0.1.0"
-def include_path():
-    return os.path.abspath(os.path.join(os.path.dirname(__file__), "include"))
-
-
-def bin_path():
-    return os.path.abspath(os.path.join(os.path.dirname(__file__), "bin"))
-
-
-def lib_path():
-    return os.path.abspath(os.path.join(os.path.dirname(__file__), "lib"))
-
-
 def _df():
     return pd.DataFrame({"a": pd.Series([1, 2, 3], dtype='Int32'), "b": pd.Series([1.1, 2.2, 3.3], dtype='Float32'), "c": pd.Series(["abc", "def", "ghi"], dtype=str)})
@@ -42,7 +29,7 @@ def create_arrow_array_in_python():
 def create_arrow_array_in_cpp():
-    return create_array()
+    return pa.Array._import_from_c_capsule(create_array())
 def create_arrow_schema_in_python():
@@ -52,8 +39,7 @@
 def create_arrow_schema_in_cpp():
-    return create_schema()
-
+    return pa.Schema._import_from_c_capsule(create_schema())
 # def create_arrow_table_in_python():
 #     table = _table()
diff --git a/cmake/modules/FindArrow.cmake b/cmake/modules/FindArrow.cmake
deleted file mode 100644
index a47b49d..0000000
--- a/cmake/modules/FindArrow.cmake
+++ /dev/null
@@ -1,37 +0,0 @@
-# Find Arrow
-# This module defines:
-#  Arrow_INCLUDE_DIR
-#  Arrow_LIBRARY
-#  Arrow_LIB_DIR
-
-find_path(Arrow_INCLUDE_DIR arrow/config.h
-          PATHS ${Arrow_ROOT}/include
-          HINTS /usr /usr/include /usr/local /usr/local/include /usr/local/Homebrew /usr/local/Homebrew/include ~/homebrew ~/homebrew/include /opt/homebrew /opt/homebrew/include
-          NO_CMAKE_SYSTEM_PATH
-          NO_SYSTEM_ENVIRONMENT_PATH)
-
-find_path(Arrow_LIB_DIR
-          NAMES libarrow.a arrow.a libarrow.so arrow.so libarrow.dylib arrow.dylib
-          PATHS ${Arrow_ROOT}/lib
-          HINTS /usr /usr/lib /usr/local /usr/local/lib /usr/local/Homebrew /usr/local/Homebrew/lib ~/homebrew/ ~/homebrew/lib /opt/homebrew/ /opt/homebrew/lib
/usr/lib/x86_64-linux-gnu - NO_CMAKE_SYSTEM_PATH - NO_SYSTEM_ENVIRONMENT_PATH) - -if(BUILD_SHARED_LIBS) - find_file(Arrow_LIBRARY - NAMES libarrow.so arrow.so libarrow.dylib arrow.dylib - PATHS ${Arrow_ROOT} - HINTS /usr /usr/lib /usr/local /usr/local/lib /usr/local/Homebrew /usr/local/Homebrew/lib ~/homebrew/ ~/homebrew/lib /opt/homebrew/ /opt/homebrew/lib /usr/lib/x86_64-linux-gnu - NO_CMAKE_SYSTEM_PATH - NO_SYSTEM_ENVIRONMENT_PATH) -else() - find_file(Arrow_LIBRARY - NAMES libarrow.a arrow.a - PATHS ${Arrow_ROOT} - HINTS /usr /usr/lib /usr/local /usr/local/lib /usr/local/Homebrew /usr/local/Homebrew/lib ~/homebrew/ ~/homebrew/lib /opt/homebrew/ /opt/homebrew/lib /usr/lib/x86_64-linux-gnu - NO_CMAKE_SYSTEM_PATH - NO_SYSTEM_ENVIRONMENT_PATH) -endif() - -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(Arrow REQUIRED_VARS Arrow_INCLUDE_DIR Arrow_LIB_DIR Arrow_LIBRARY) diff --git a/cmake/modules/FindThrift.cmake b/cmake/modules/FindThrift.cmake deleted file mode 100644 index a505e0d..0000000 --- a/cmake/modules/FindThrift.cmake +++ /dev/null @@ -1,40 +0,0 @@ -# Find Thrift -# This module defines: -# Thrift_INCLUDE_DIR -# Thrift_LIBRARY -# Thrift_LIB_DIR - -find_path(Thrift_INCLUDE_DIR thrift/Thrift.h - PATHS ${Thrift_ROOT}/include - HINTS /usr /usr/include /usr/local /usr/local/include /usr/local/Homebrew /usr/local/Homebrew/include ~/homebrew ~/homebrew/include /opt/homebrew /opt/homebrew/include - NO_CMAKE_SYSTEM_PATH - NO_SYSTEM_ENVIRONMENT_PATH) - -find_path(Thrift_LIB_DIR - NAMES libthrift.a libthrift.so libthrift.dylib - PATHS ${Thrift_ROOT}/lib - HINTS /usr /usr/lib /usr/local /usr/local/lib /usr/local/Homebrew /usr/local/Homebrew/lib ~/homebrew/ ~/homebrew/lib /opt/homebrew/ /opt/homebrew/lib - NO_CMAKE_SYSTEM_PATH - NO_SYSTEM_ENVIRONMENT_PATH) - -if(BUILD_SHARED_LIBS) - find_file(Thrift_LIBRARY - NAMES libthrift.so thrift.so libthrift.dylib thrift.dylib - PATHS ${Thrift_ROOT} - HINTS /usr /usr/lib /usr/local /usr/local/lib /usr/local/Homebrew /usr/local/Homebrew/lib ~/homebrew/ ~/homebrew/lib /opt/homebrew/ /opt/homebrew/lib - NO_CMAKE_SYSTEM_PATH - NO_SYSTEM_ENVIRONMENT_PATH) -else() - find_file(Thrift_LIBRARY - NAMES libthrift.a thrift.a - PATHS ${Thrift_ROOT} - HINTS /usr /usr/lib /usr/local /usr/local/lib /usr/local/Homebrew /usr/local/Homebrew/lib ~/homebrew/ ~/homebrew/lib /opt/homebrew/ /opt/homebrew/lib - NO_CMAKE_SYSTEM_PATH - NO_SYSTEM_ENVIRONMENT_PATH) -endif() - -# For apache arrow -set(ThriftAlt_LIB ${Thrift_LIBRARY}) - -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(Thrift REQUIRED_VARS Thrift_INCLUDE_DIR Thrift_LIB_DIR Thrift_LIBRARY) diff --git a/pyproject.toml b/pyproject.toml index 18dd6f6..60fcee9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,6 @@ requires = [ "cmake", "ninja", - "numpy", "pybind11[global]", "scikit-build", "setuptools", @@ -61,9 +60,9 @@ test = [ ] [project.urls] -Repository = "https://github.com/python-project-templates/cpp" -Homepage = "https://github.com/python-project-templates/cpp" -Issues = "https://github.com/python-project-templates/cpp/issues" +Repository = "https://github.com/timkpaine/arrow-cpp-python-nocopy" +Homepage = "https://github.com/timkpaine/arrow-cpp-python-nocopy" +Issues = "https://github.com/timkpaine/arrow-cpp-python-nocopy/issues" [tool.black] color = true diff --git a/setup.py b/setup.py index 2f34355..1f2bf2d 100644 --- a/setup.py +++ b/setup.py @@ -1,10 +1,26 @@ +import multiprocessing import os import os.path -import multiprocessing +import subprocess 
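
# ---------------------------------------------------------------------------
# Aside (not part of the patch): the capsule hand-off used in
# arrow_python_nocopy/__init__.py above. pa.Array._import_from_c_capsule and
# pa.Schema._import_from_c_capsule are private pyarrow APIs; a hedged sketch
# of the same round trip via the public PyCapsule protocol instead, assuming
# pyarrow >= 14 and the create_schema extension function imported above.
# Note, also hedged: recent pyarrow expects
# Array._import_from_c_capsule(schema_capsule, array_capsule) as a pair, so
# the single-capsule call above may need revisiting as this WIP progresses.
# ---------------------------------------------------------------------------
import pyarrow as pa
from arrow_python_nocopy.lib.cpythonextension import create_schema

class _CapsuleSchema:
    """Minimal producer-side wrapper exposing the Arrow PyCapsule protocol."""

    def __init__(self, capsule):
        self._capsule = capsule

    def __arrow_c_schema__(self):
        # Hand back the PyCapsule named "arrow_schema" made by the extension
        return self._capsule

# Public API import path: no pyarrow C++ ABI, no private helpers
schema = pa.schema(_CapsuleSchema(create_schema()))
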
import sys import sysconfig +from shutil import which from skbuild import setup +# This will be used for e.g. the sdist +if not os.path.exists("vcpkg"): + subprocess.call(["git", "clone", "https://github.com/Microsoft/vcpkg.git"]) +if not os.path.exists("vcpkg/ports"): + subprocess.call(["git", "submodule", "update", "--init", "--recursive"]) +if not os.path.exists("vcpkg/buildtrees"): + subprocess.call(["git", "pull"], cwd="vcpkg") + if os.name == "nt": + subprocess.call(["bootstrap-vcpkg.bat"], cwd="vcpkg") + subprocess.call(["vcpkg", "install"], cwd="vcpkg") + else: + subprocess.call(["./bootstrap-vcpkg.sh"], cwd="vcpkg") + subprocess.call(["./vcpkg", "install"], cwd="vcpkg") + python_version = f"{sys.version_info.major}.{sys.version_info.minor}" cmake_args = [f"-DPYTHON_VERSION={python_version}"] vcpkg_config_file = os.path.abspath(os.path.join("vcpkg/scripts/buildsystems/vcpkg.cmake")) @@ -19,8 +35,8 @@ ] ) -# if os.path.exists(vcpkg_config_file) and os.name != "posix": -# cmake_args.append(f"-DCMAKE_TOOLCHAIN_FILE={vcpkg_config_file}") +if os.path.exists(vcpkg_config_file): + cmake_args.append(f"-DCMAKE_TOOLCHAIN_FILE={vcpkg_config_file}") if "DEBUG" in os.environ: cmake_args.append("-DCMAKE_BUILD_TYPE=Debug") @@ -31,6 +47,9 @@ if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ: os.environ["CMAKE_BUILD_PARALLEL_LEVEL"] = str(multiprocessing.cpu_count()) +if which("ccache"): + cmake_args.append("-DUSE_CCACHE=On") + setup( name="arrow_python_nocopy", version="0.1.0", diff --git a/src/apn-python/caster.hpp b/src/apn-python/caster.hpp index 248e265..c4368d5 100644 --- a/src/apn-python/caster.hpp +++ b/src/apn-python/caster.hpp @@ -1,72 +1,72 @@ -#pragma once +// #pragma once -namespace pybind11 { -namespace detail { - template <> - struct type_caster> { - public: - PYBIND11_TYPE_CASTER(std::shared_ptr, const_name("pyarrow::Array")); - /* Python->C++ */ - bool load(handle src, bool) { - PyObject* source = src.ptr(); - if(!arrow::py::is_array(source)) - return false; - arrow::Result> result = arrow::py::unwrap_array(source); - if(!result.ok()) - return false; - value = std::static_pointer_cast(result.ValueOrDie()); - return true; - } +// namespace pybind11 { +// namespace detail { +// template <> +// struct type_caster> { +// public: +// PYBIND11_TYPE_CASTER(std::shared_ptr, const_name("pyarrow::Array")); +// /* Python->C++ */ +// bool load(handle src, bool) { +// PyObject* source = src.ptr(); +// if(!arrow::py::is_array(source)) +// return false; +// arrow::Result> result = arrow::py::unwrap_array(source); +// if(!result.ok()) +// return false; +// value = std::static_pointer_cast(result.ValueOrDie()); +// return true; +// } - /* C++ -> Python) */ - static handle cast(std::shared_ptr src, return_value_policy /* policy */, handle /* parent */) { - return arrow::py::wrap_array(src); - } - }; +// /* C++ -> Python) */ +// static handle cast(std::shared_ptr src, return_value_policy /* policy */, handle /* parent */) { +// return arrow::py::wrap_array(src); +// } +// }; - template <> - struct type_caster> { - public: - PYBIND11_TYPE_CASTER(std::shared_ptr, const_name("pyarrow::Schema")); - /* Python->C++ */ - bool load(handle src, bool) { - PyObject* source = src.ptr(); - if(!arrow::py::is_schema(source)) - return false; - arrow::Result> result = arrow::py::unwrap_schema(source); - if(!result.ok()) - return false; - value = std::static_pointer_cast(result.ValueOrDie()); - return true; - } +// template <> +// struct type_caster> { +// public: +// PYBIND11_TYPE_CASTER(std::shared_ptr, 
const_name("pyarrow::Schema")); +// /* Python->C++ */ +// bool load(handle src, bool) { +// PyObject* source = src.ptr(); +// if(!arrow::py::is_schema(source)) +// return false; +// arrow::Result> result = arrow::py::unwrap_schema(source); +// if(!result.ok()) +// return false; +// value = std::static_pointer_cast(result.ValueOrDie()); +// return true; +// } - /* C++ -> Python) */ - static handle cast(std::shared_ptr src, return_value_policy /* policy */, handle /* parent */) { - return arrow::py::wrap_schema(src); - } - }; +// /* C++ -> Python) */ +// static handle cast(std::shared_ptr src, return_value_policy /* policy */, handle /* parent */) { +// return arrow::py::wrap_schema(src); +// } +// }; - template <> - struct type_caster> { - public: - PYBIND11_TYPE_CASTER(std::shared_ptr, const_name("pyarrow::Table")); - /* Python->C++ */ - bool load(handle src, bool) { - PyObject* source = src.ptr(); - if(!arrow::py::is_table(source)) - return false; - arrow::Result> result = arrow::py::unwrap_table(source); - if(!result.ok()) - return false; - value = std::static_pointer_cast(result.ValueOrDie()); - return true; - } +// template <> +// struct type_caster> { +// public: +// PYBIND11_TYPE_CASTER(std::shared_ptr, const_name("pyarrow::Table")); +// /* Python->C++ */ +// bool load(handle src, bool) { +// PyObject* source = src.ptr(); +// if(!arrow::py::is_table(source)) +// return false; +// arrow::Result> result = arrow::py::unwrap_table(source); +// if(!result.ok()) +// return false; +// value = std::static_pointer_cast(result.ValueOrDie()); +// return true; +// } - /* C++ -> Python) */ - static handle cast(std::shared_ptr src, return_value_policy /* policy */, handle /* parent */) { - return arrow::py::wrap_table(src); - } - }; +// /* C++ -> Python) */ +// static handle cast(std::shared_ptr src, return_value_policy /* policy */, handle /* parent */) { +// return arrow::py::wrap_table(src); +// } +// }; -} -} +// } +// } diff --git a/src/apn-python/common.cpp b/src/apn-python/common.cpp deleted file mode 100644 index f4e6888..0000000 --- a/src/apn-python/common.cpp +++ /dev/null @@ -1,54 +0,0 @@ -#include -#include -#include -#include - -char* array_info_py(std::shared_ptr array) { - // ABI unstable! - // return array_info(array); - char* buffer = new char[100]; - struct ArrowArray c_array; - (void)ExportArray(*array, &c_array); - array_info_cabi(&c_array, buffer, 100); - return buffer; -} - -std::shared_ptr create_array_py() { - // ABI unstable! - // std::shared_ptr arrow_array = create_array_cabi(); - struct ArrowArray c_array; - create_array_cabi(&c_array); - std::shared_ptr arrow_array = arrow::ImportArray(&c_array, arrow::int32()).ValueOrDie(); - return arrow_array; -} - -char* schema_info_py(std::shared_ptr schema) { - // ABI unstable! - // return schema_info(schema); - char* buffer = new char[100]; - struct ArrowSchema c_schema; - (void)arrow::ExportSchema(*schema, &c_schema); - schema_info_cabi(&c_schema, buffer, 100); - return buffer; -} - -std::shared_ptr create_schema_py() { - // ABI unstable! 
- // std::shared_ptr arrow_schema = create_schema(); - - struct ArrowSchema c_schema; - create_schema_cabi(&c_schema); - std::shared_ptr arrow_schema = arrow::ImportSchema(&c_schema).ValueOrDie(); - return arrow_schema; -} - -// std::string table_info_py(std::shared_ptr table) { -// return table_info(table); -// } - -// std::shared_ptr create_table_py() { -// std::shared_ptr arrow_table = create_table(); -// return arrow_table; -// // PyObject* obj = arrow::py::wrap_table(arrow_table); -// // return pybind11::cast(obj); -// } diff --git a/src/apn-python/common.hpp b/src/apn-python/common.hpp deleted file mode 100644 index 0564b08..0000000 --- a/src/apn-python/common.hpp +++ /dev/null @@ -1,14 +0,0 @@ -#pragma once -#include -#include -#include -#include - -LIB_EXPORT char* array_info_py(std::shared_ptr array); -LIB_EXPORT std::shared_ptr create_array_py(); - -LIB_EXPORT char* schema_info_py(std::shared_ptr schema); -LIB_EXPORT std::shared_ptr create_schema_py(); - -LIB_EXPORT char* table_info_py(std::shared_ptr table); -LIB_EXPORT std::shared_ptr create_table_py(); diff --git a/src/apn-python/cpython.cpp b/src/apn-python/cpython.cpp index 9fd6284..853a3c2 100644 --- a/src/apn-python/cpython.cpp +++ b/src/apn-python/cpython.cpp @@ -1,53 +1,118 @@ #include +#include +#include static PyObject* _raise_error(PyObject* module) { PyErr_SetString(PyExc_TypeError, "Bad value provided"); return NULL; } -PyObject* array_info_py_raw(PyObject* self, PyObject* args) { +PyObject* array_info_py(PyObject* self, PyObject* args) { PyObject* source; if(!PyArg_ParseTuple(args, "O", &source)) return _raise_error(self); - if(!arrow::py::is_array(source)) + // Old non-pycapsule way, not ABI stable + // and relies on pyarrow + // if(!arrow::py::is_array(source)) + // return _raise_error(self); + // arrow::Result> result = arrow::py::unwrap_array(source); + // if(!result.ok()) + // return _raise_error(self); + // char* ret_str = array_info_py(std::static_pointer_cast(result.ValueOrDie())); + + // New pycapsule way + if(!PyObject_HasAttrString(source, "__arrow_c_array__")) return _raise_error(self); - arrow::Result> result = arrow::py::unwrap_array(source); + // extract the capsule + PyObject* array_capsule = PyObject_CallNoArgs(PyObject_GetAttrString(source, "__arrow_c_array__")); + struct ArrowArray* c_array = (struct ArrowArray*) PyCapsule_GetPointer(array_capsule, "arrow_array"); - if(!result.ok()) - return _raise_error(self); + // Convert C array to C++ array and extract info + // TODO hardcoding datatype here, you would want to also pass in schema in real world + std::shared_ptr arrow_array = arrow::ImportArray(c_array, arrow::int32()).ValueOrDie(); + + // Get info and return + std::string info = array_info(arrow_array); + return PyUnicode_FromStringAndSize(info.c_str(), info.length()); +} - char* ret_str = array_info_py(std::static_pointer_cast(result.ValueOrDie())); - return PyUnicode_FromStringAndSize(ret_str, strlen(ret_str)); +void ReleaseArrowArrayPyCapsule(PyObject* array_capsule) { + struct ArrowArray* c_array = (struct ArrowArray*)PyCapsule_GetPointer(array_capsule, "arrow_array"); + // if (c_array->release != NULL) { + // c_array->release(c_array); + // } + // free(c_array); } -PyObject* create_array_py_raw(PyObject* self, PyObject* args) { - return arrow::py::wrap_array(create_array_py()); +PyObject* create_array_py(PyObject* self, PyObject* args) { + // Old non-pycapsule way + // return arrow::py::wrap_array(create_array_py()); + + // Create array with arrow C++ + std::shared_ptr array = 
create_array('a'); + + // Convert to C api + struct ArrowArray *c_array = (struct ArrowArray*)malloc(sizeof(struct ArrowArray)); + (void)arrow::ExportArray(*array, c_array); + + // Hoist out to pycapsule + return PyCapsule_New(c_array, "arrow_array", ReleaseArrowArrayPyCapsule); } -PyObject* schema_info_py_raw(PyObject* self, PyObject* args) { +PyObject* schema_info_py(PyObject* self, PyObject* args) { PyObject* source; // parse arguments if(!PyArg_ParseTuple(args, "O", &source)) return _raise_error(self); - if(!arrow::py::is_schema(source)) + // Old non-pycapsule way, not ABI stable + // and relies on pyarrow + // if(!arrow::py::is_schema(source)) + // return _raise_error(self); + // if(!result.ok()) + // return _raise_error(self); + // arrow::Result> result = arrow::py::unwrap_schema(source); + // char* ret_str = schema_info_py(std::static_pointer_cast(result.ValueOrDie())); + + // New pycapsule way + if(!PyObject_HasAttrString(source, "__arrow_c_schema__")) return _raise_error(self); - arrow::Result> result = arrow::py::unwrap_schema(source); + // extract the capsule + PyObject* schema_capsule = PyObject_CallNoArgs(PyObject_GetAttrString(source, "__arrow_c_schema__")); + struct ArrowSchema* c_schema = (struct ArrowSchema*) PyCapsule_GetPointer(schema_capsule, "arrow_schema"); - if(!result.ok()) - return _raise_error(self); + // Convert C schema to C++ schema and extract info + std::shared_ptr arrow_schema = arrow::ImportSchema(c_schema).ValueOrDie(); + std::string info = schema_info(arrow_schema); + return PyUnicode_FromStringAndSize(info.c_str(), info.length()); +} - char* ret_str = schema_info_py(std::static_pointer_cast(result.ValueOrDie())); - return PyUnicode_FromStringAndSize(ret_str, strlen(ret_str)); +void ReleaseArrowSchemaPyCapsule(PyObject* schema_capsule) { + struct ArrowSchema* c_schema = (struct ArrowSchema*)PyCapsule_GetPointer(schema_capsule, "arrow_schema"); + if (c_schema->release != NULL) { + c_schema->release(c_schema); + } + // free(c_schema); } -PyObject* create_schema_py_raw(PyObject* self, PyObject* Py_UNUSED(args)) { - return arrow::py::wrap_schema(create_schema_py()); +PyObject* create_schema_py(PyObject* self, PyObject* Py_UNUSED(args)) { + // Old non-pycapsule way + // return arrow::py::wrap_schema(create_schema_py()); + + // Create array with arrow C++ + std::shared_ptr schema = create_schema(); + + // Convert to C api + struct ArrowSchema* c_schema = (struct ArrowSchema*)malloc(sizeof(struct ArrowSchema)); + (void)arrow::ExportSchema(*schema, c_schema); + + // Hoist out to pycapsule + return PyCapsule_New(c_schema, "arrow_schema", ReleaseArrowSchemaPyCapsule); } // std::string table_info_py_raw(PyObject* source) { diff --git a/src/apn-python/cpython.hpp b/src/apn-python/cpython.hpp index 17ffbcb..68d263d 100644 --- a/src/apn-python/cpython.hpp +++ b/src/apn-python/cpython.hpp @@ -1,21 +1,20 @@ #pragma once #include "Python.h" -#include -#include +#include -LIB_EXPORT PyObject* array_info_py_raw(PyObject*, PyObject*); -LIB_EXPORT PyObject* create_array_py_raw(PyObject*, PyObject*); -LIB_EXPORT PyObject* schema_info_py_raw(PyObject*, PyObject*); -LIB_EXPORT PyObject* create_schema_py_raw(PyObject*, PyObject*); +LIB_EXPORT PyObject* array_info_py(PyObject*, PyObject*); +LIB_EXPORT PyObject* create_array_py(PyObject*, PyObject*); +LIB_EXPORT PyObject* schema_info_py(PyObject*, PyObject*); +LIB_EXPORT PyObject* create_schema_py(PyObject*, PyObject*); // LIB_EXPORT PyObject* table_info_py_raw(PyObject*, PyObject*); // LIB_EXPORT PyObject* 
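
// ---------------------------------------------------------------------------
// Aside (not part of the patch): capsule lifetime in cpython.cpp above. The
// array-capsule destructor currently comments out both the release call and
// the free, which leaks the exported array whenever a consumer never imports
// it. Per the Arrow PyCapsule interface, the capsule owns the struct: call
// release() if it is still set (an importer NULLs it out when taking
// ownership), then free the malloc'd struct. A hedged sketch:
// ---------------------------------------------------------------------------
static void ReleaseArrowArrayPyCapsuleSketch(PyObject* array_capsule) {
    auto* c_array = static_cast<struct ArrowArray*>(
        PyCapsule_GetPointer(array_capsule, "arrow_array"));
    if (c_array != nullptr) {
        if (c_array->release != nullptr) {
            c_array->release(c_array);  // never consumed: release the data
        }
        free(c_array);  // the struct itself was malloc'd in create_array_py
    }
}
// Related review notes, also hedged: PyObject_GetAttrString and
// PyObject_CallNoArgs in array_info_py/schema_info_py above return new
// references that are never Py_DECREF'd, and ImportArray/ImportSchema move
// the data out of the C structs while the capsule destructor may still run
// afterwards, which is harmless only if the destructor checks release for
// NULL as sketched here.
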
create_table_py_raw(PyObject*, PyObject*); static PyMethodDef cpythonextension_methods[] = { - {"array_info", (PyCFunction)array_info_py_raw, METH_VARARGS}, - {"create_array", (PyCFunction)create_array_py_raw, METH_NOARGS}, - {"schema_info", (PyCFunction)schema_info_py_raw, METH_VARARGS}, - {"create_schema", (PyCFunction)create_schema_py_raw, METH_NOARGS}, + {"array_info", (PyCFunction)array_info_py, METH_VARARGS}, + {"create_array", (PyCFunction)create_array_py, METH_NOARGS}, + {"schema_info", (PyCFunction)schema_info_py, METH_VARARGS}, + {"create_schema", (PyCFunction)create_schema_py, METH_NOARGS}, {nullptr, nullptr, 0, nullptr} }; @@ -23,7 +22,6 @@ static PyModuleDef cpythonextension_module = { PyModuleDef_HEAD_INIT, "cpythonextension", "cpython", -1, cpythonextension_methods}; PyMODINIT_FUNC PyInit_cpythonextension(void) { - arrow::py::import_pyarrow(); Py_Initialize(); return PyModule_Create(&cpythonextension_module); } diff --git a/src/apn-python/pybind11.cpp b/src/apn-python/pybind11.cpp index 003774c..564996e 100644 --- a/src/apn-python/pybind11.cpp +++ b/src/apn-python/pybind11.cpp @@ -1,2 +1,2 @@ -#include +// #include diff --git a/src/apn-python/pybind11.hpp b/src/apn-python/pybind11.hpp index 277b16b..a625dfb 100644 --- a/src/apn-python/pybind11.hpp +++ b/src/apn-python/pybind11.hpp @@ -1,24 +1,17 @@ -#pragma once -#include -#include -#include -#include -#include -#include +// #pragma once +// #include +// #include +// #include -namespace py = pybind11; +// namespace py = pybind11; -// LIB_EXPORT PyObject* table_info_py_raw(PyObject*, PyObject*); -PYBIND11_MODULE(pybind11extension, m) { - py::module_::import("pyarrow"); - // dlopen("arrow_python.so", RTLD_LAZY); - // dlopen("libarrow_python.so", RTLD_LAZY); - import_pyarrow__lib(); - m.doc() = "pybind11"; - m.def("array_info", &array_info_py); - m.def("create_array", &create_array_py); - m.def("schema_info", &schema_info_py); - m.def("create_schema", &create_schema_py); - // m.def("table_info", &table_info_py, ""); - // m.def("create_table", &create_table_py, ""); -} +// // LIB_EXPORT PyObject* table_info_py_raw(PyObject*, PyObject*); +// PYBIND11_MODULE(pybind11extension, m) { +// m.doc() = "pybind11"; +// m.def("array_info", &array_info_py); +// m.def("create_array", &create_array_py); +// m.def("schema_info", &schema_info_py); +// m.def("create_schema", &create_schema_py); +// // m.def("table_info", &table_info_py, ""); +// // m.def("create_table", &create_table_py, ""); +// } diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/CMakeLists.txt b/src/vendored/apache-arrow-12.0.1/arrow/python/CMakeLists.txt deleted file mode 100644 index ff355e4..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/CMakeLists.txt +++ /dev/null @@ -1,18 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. - -arrow_install_all_headers("arrow/python") diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/api.h b/src/vendored/apache-arrow-12.0.1/arrow/python/api.h deleted file mode 100644 index a0b13d6..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/api.h +++ /dev/null @@ -1,30 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include "arrow/python/arrow_to_pandas.h" -#include "arrow/python/common.h" -#include "arrow/python/datetime.h" -#include "arrow/python/deserialize.h" -#include "arrow/python/helpers.h" -#include "arrow/python/inference.h" -#include "arrow/python/io.h" -#include "arrow/python/numpy_convert.h" -#include "arrow/python/numpy_to_arrow.h" -#include "arrow/python/python_to_arrow.h" -#include "arrow/python/serialize.h" diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/arrow_to_pandas.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/arrow_to_pandas.cc deleted file mode 100644 index 91c7b8a..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/arrow_to_pandas.cc +++ /dev/null @@ -1,2575 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -// Functions for pandas conversion via NumPy - -#include "arrow/python/arrow_to_pandas.h" -#include "arrow/python/numpy_interop.h" // IWYU pragma: expand - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "arrow/array.h" -#include "arrow/buffer.h" -#include "arrow/datum.h" -#include "arrow/status.h" -#include "arrow/table.h" -#include "arrow/type.h" -#include "arrow/type_traits.h" -#include "arrow/util/checked_cast.h" -#include "arrow/util/hashing.h" -#include "arrow/util/int_util.h" -#include "arrow/util/logging.h" -#include "arrow/util/macros.h" -#include "arrow/util/parallel.h" -#include "arrow/visit_type_inline.h" - -#include "arrow/compute/api.h" - -#include "arrow/python/arrow_to_python_internal.h" -#include "arrow/python/common.h" -#include "arrow/python/datetime.h" -#include "arrow/python/decimal.h" -#include "arrow/python/helpers.h" -#include "arrow/python/numpy_convert.h" -#include "arrow/python/numpy_internal.h" -#include "arrow/python/pyarrow.h" -#include "arrow/python/python_to_arrow.h" -#include "arrow/python/type_traits.h" - -namespace arrow { - -class MemoryPool; - -using internal::checked_cast; -using internal::CheckIndexBounds; -using internal::OptionalParallelFor; - -namespace py { -namespace { - -// Fix options for conversion of an inner (child) array. -PandasOptions MakeInnerOptions(PandasOptions options) { - // Make sure conversion of inner dictionary arrays always returns an array, - // not a dict {'indices': array, 'dictionary': array, 'ordered': bool} - options.decode_dictionaries = true; - options.categorical_columns.clear(); - options.strings_to_categorical = false; - - // In ARROW-7723, we found as a result of ARROW-3789 that second - // through microsecond resolution tz-aware timestamps were being promoted to - // use the DATETIME_NANO_TZ conversion path, yielding a datetime64[ns] NumPy - // array in this function. PyArray_GETITEM returns datetime.datetime for - // units second through microsecond but PyLong for nanosecond (because - // datetime.datetime does not support nanoseconds). - // We force the object conversion to preserve the value of the timezone. - // Nanoseconds are returned as integers. 
- options.coerce_temporal_nanoseconds = false; - - return options; -} - -// ---------------------------------------------------------------------- -// PyCapsule code for setting ndarray base to reference C++ object - -struct ArrayCapsule { - std::shared_ptr array; -}; - -struct BufferCapsule { - std::shared_ptr buffer; -}; - -void ArrayCapsule_Destructor(PyObject* capsule) { - delete reinterpret_cast(PyCapsule_GetPointer(capsule, "arrow::Array")); -} - -void BufferCapsule_Destructor(PyObject* capsule) { - delete reinterpret_cast(PyCapsule_GetPointer(capsule, "arrow::Buffer")); -} - -// ---------------------------------------------------------------------- -// pandas 0.x DataFrame conversion internals - -using internal::arrow_traits; -using internal::npy_traits; - -template -struct WrapBytes {}; - -template <> -struct WrapBytes { - static inline PyObject* Wrap(const char* data, int64_t length) { - return PyUnicode_FromStringAndSize(data, length); - } -}; - -template <> -struct WrapBytes { - static inline PyObject* Wrap(const char* data, int64_t length) { - return PyUnicode_FromStringAndSize(data, length); - } -}; - -template <> -struct WrapBytes { - static inline PyObject* Wrap(const char* data, int64_t length) { - return PyBytes_FromStringAndSize(data, length); - } -}; - -template <> -struct WrapBytes { - static inline PyObject* Wrap(const char* data, int64_t length) { - return PyBytes_FromStringAndSize(data, length); - } -}; - -template <> -struct WrapBytes { - static inline PyObject* Wrap(const char* data, int64_t length) { - return PyBytes_FromStringAndSize(data, length); - } -}; - -static inline bool ListTypeSupported(const DataType& type) { - switch (type.id()) { - case Type::BOOL: - case Type::UINT8: - case Type::INT8: - case Type::UINT16: - case Type::INT16: - case Type::UINT32: - case Type::INT32: - case Type::INT64: - case Type::UINT64: - case Type::HALF_FLOAT: - case Type::FLOAT: - case Type::DOUBLE: - case Type::DECIMAL128: - case Type::DECIMAL256: - case Type::BINARY: - case Type::LARGE_BINARY: - case Type::STRING: - case Type::LARGE_STRING: - case Type::DATE32: - case Type::DATE64: - case Type::STRUCT: - case Type::MAP: - case Type::TIME32: - case Type::TIME64: - case Type::TIMESTAMP: - case Type::DURATION: - case Type::DICTIONARY: - case Type::INTERVAL_MONTH_DAY_NANO: - case Type::NA: // empty list - // The above types are all supported. 
- return true; - case Type::FIXED_SIZE_LIST: - case Type::LIST: - case Type::LARGE_LIST: { - const auto& list_type = checked_cast(type); - return ListTypeSupported(*list_type.value_type()); - } - case Type::EXTENSION: { - const auto& ext = checked_cast(*type.GetSharedPtr()); - return ListTypeSupported(*(ext.storage_type())); - } - default: - break; - } - return false; -} - -Status CapsulizeArray(const std::shared_ptr& arr, PyObject** out) { - auto capsule = new ArrayCapsule{{arr}}; - *out = PyCapsule_New(reinterpret_cast(capsule), "arrow::Array", - &ArrayCapsule_Destructor); - if (*out == nullptr) { - delete capsule; - RETURN_IF_PYERROR(); - } - return Status::OK(); -} - -Status CapsulizeBuffer(const std::shared_ptr& buffer, PyObject** out) { - auto capsule = new BufferCapsule{{buffer}}; - *out = PyCapsule_New(reinterpret_cast(capsule), "arrow::Buffer", - &BufferCapsule_Destructor); - if (*out == nullptr) { - delete capsule; - RETURN_IF_PYERROR(); - } - return Status::OK(); -} - -Status SetNdarrayBase(PyArrayObject* arr, PyObject* base) { - if (PyArray_SetBaseObject(arr, base) == -1) { - // Error occurred, trust that SetBaseObject sets the error state - Py_XDECREF(base); - RETURN_IF_PYERROR(); - } - return Status::OK(); -} - -Status SetBufferBase(PyArrayObject* arr, const std::shared_ptr& buffer) { - PyObject* base; - RETURN_NOT_OK(CapsulizeBuffer(buffer, &base)); - return SetNdarrayBase(arr, base); -} - -inline void set_numpy_metadata(int type, const DataType* datatype, PyArray_Descr* out) { - auto metadata = reinterpret_cast(out->c_metadata); - if (type == NPY_DATETIME) { - if (datatype->id() == Type::TIMESTAMP) { - const auto& timestamp_type = checked_cast(*datatype); - metadata->meta.base = internal::NumPyFrequency(timestamp_type.unit()); - } else { - DCHECK(false) << "NPY_DATETIME views only supported for Arrow TIMESTAMP types"; - } - } else if (type == NPY_TIMEDELTA) { - DCHECK_EQ(datatype->id(), Type::DURATION); - const auto& duration_type = checked_cast(*datatype); - metadata->meta.base = internal::NumPyFrequency(duration_type.unit()); - } -} - -Status PyArray_NewFromPool(int nd, npy_intp* dims, PyArray_Descr* descr, MemoryPool* pool, - PyObject** out) { - // ARROW-6570: Allocate memory from MemoryPool for a couple reasons - // - // * Track allocations - // * Get better performance through custom allocators - int64_t total_size = descr->elsize; - for (int i = 0; i < nd; ++i) { - total_size *= dims[i]; - } - - ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateBuffer(total_size, pool)); - *out = PyArray_NewFromDescr(&PyArray_Type, descr, nd, dims, - /*strides=*/nullptr, - /*data=*/buffer->mutable_data(), - /*flags=*/NPY_ARRAY_CARRAY | NPY_ARRAY_WRITEABLE, - /*obj=*/nullptr); - if (*out == nullptr) { - RETURN_IF_PYERROR(); - // Trust that error set if NULL returned - } - return SetBufferBase(reinterpret_cast(*out), std::move(buffer)); -} - -template -inline const T* GetPrimitiveValues(const Array& arr) { - if (arr.length() == 0) { - return nullptr; - } - const int elsize = arr.type()->byte_width(); - const auto& prim_arr = checked_cast(arr); - return reinterpret_cast(prim_arr.values()->data() + arr.offset() * elsize); -} - -Status MakeNumPyView(std::shared_ptr arr, PyObject* py_ref, int npy_type, int ndim, - npy_intp* dims, PyObject** out) { - PyAcquireGIL lock; - - PyArray_Descr* descr = internal::GetSafeNumPyDtype(npy_type); - set_numpy_metadata(npy_type, arr->type().get(), descr); - PyObject* result = PyArray_NewFromDescr( - &PyArray_Type, descr, ndim, dims, /*strides=*/nullptr, - 
const_cast(GetPrimitiveValues(*arr)), /*flags=*/0, nullptr); - PyArrayObject* np_arr = reinterpret_cast(result); - if (np_arr == nullptr) { - // Error occurred, trust that error set - return Status::OK(); - } - - PyObject* base; - if (py_ref == nullptr) { - // Capsule will be owned by the ndarray, no incref necessary. See - // ARROW-1973 - RETURN_NOT_OK(CapsulizeArray(arr, &base)); - } else { - Py_INCREF(py_ref); - base = py_ref; - } - RETURN_NOT_OK(SetNdarrayBase(np_arr, base)); - - // Do not allow Arrow data to be mutated - PyArray_CLEARFLAGS(np_arr, NPY_ARRAY_WRITEABLE); - *out = result; - return Status::OK(); -} - -class PandasWriter { - public: - enum type { - OBJECT, - UINT8, - INT8, - UINT16, - INT16, - UINT32, - INT32, - UINT64, - INT64, - HALF_FLOAT, - FLOAT, - DOUBLE, - BOOL, - DATETIME_DAY, - DATETIME_SECOND, - DATETIME_MILLI, - DATETIME_MICRO, - DATETIME_NANO, - DATETIME_SECOND_TZ, - DATETIME_MILLI_TZ, - DATETIME_MICRO_TZ, - DATETIME_NANO_TZ, - TIMEDELTA_SECOND, - TIMEDELTA_MILLI, - TIMEDELTA_MICRO, - TIMEDELTA_NANO, - CATEGORICAL, - EXTENSION - }; - - PandasWriter(const PandasOptions& options, int64_t num_rows, int num_columns) - : options_(options), num_rows_(num_rows), num_columns_(num_columns) { - PyAcquireGIL lock; - internal::InitPandasStaticData(); - } - virtual ~PandasWriter() {} - - void SetBlockData(PyObject* arr) { - block_arr_.reset(arr); - block_data_ = - reinterpret_cast(PyArray_DATA(reinterpret_cast(arr))); - } - - /// \brief Either copy or wrap single array to create pandas-compatible array - /// for Series or DataFrame. num_columns_ can only be 1. Will try to zero - /// copy if possible (or error if not possible and zero_copy_only=True) - virtual Status TransferSingle(std::shared_ptr data, PyObject* py_ref) = 0; - - /// \brief Copy ChunkedArray into a multi-column block - virtual Status CopyInto(std::shared_ptr data, int64_t rel_placement) = 0; - - Status EnsurePlacementAllocated() { - std::lock_guard guard(allocation_lock_); - if (placement_data_ != nullptr) { - return Status::OK(); - } - PyAcquireGIL lock; - npy_intp placement_dims[1] = {num_columns_}; - PyObject* placement_arr = PyArray_SimpleNew(1, placement_dims, NPY_INT64); - RETURN_IF_PYERROR(); - placement_arr_.reset(placement_arr); - placement_data_ = reinterpret_cast( - PyArray_DATA(reinterpret_cast(placement_arr))); - return Status::OK(); - } - - Status EnsureAllocated() { - std::lock_guard guard(allocation_lock_); - if (block_data_ != nullptr) { - return Status::OK(); - } - RETURN_NOT_OK(Allocate()); - return Status::OK(); - } - - virtual bool CanZeroCopy(const ChunkedArray& data) const { return false; } - - virtual Status Write(std::shared_ptr data, int64_t abs_placement, - int64_t rel_placement) { - RETURN_NOT_OK(EnsurePlacementAllocated()); - if (num_columns_ == 1 && options_.allow_zero_copy_blocks) { - RETURN_NOT_OK(TransferSingle(data, /*py_ref=*/nullptr)); - } else { - RETURN_NOT_OK( - CheckNoZeroCopy("Cannot do zero copy conversion into " - "multi-column DataFrame block")); - RETURN_NOT_OK(EnsureAllocated()); - RETURN_NOT_OK(CopyInto(data, rel_placement)); - } - placement_data_[rel_placement] = abs_placement; - return Status::OK(); - } - - virtual Status GetDataFrameResult(PyObject** out) { - PyObject* result = PyDict_New(); - RETURN_IF_PYERROR(); - - PyObject* block; - RETURN_NOT_OK(GetResultBlock(&block)); - - PyDict_SetItemString(result, "block", block); - PyDict_SetItemString(result, "placement", placement_arr_.obj()); - - RETURN_NOT_OK(AddResultMetadata(result)); - *out = result; - 
return Status::OK(); - } - - // Caller steals the reference to this object - virtual Status GetSeriesResult(PyObject** out) { - RETURN_NOT_OK(MakeBlock1D()); - // Caller owns the object now - *out = block_arr_.detach(); - return Status::OK(); - } - - protected: - virtual Status AddResultMetadata(PyObject* result) { return Status::OK(); } - - Status MakeBlock1D() { - // For Series or for certain DataFrame block types, we need to shape to a - // 1D array when there is only one column - PyAcquireGIL lock; - - DCHECK_EQ(1, num_columns_); - - npy_intp new_dims[1] = {static_cast(num_rows_)}; - PyArray_Dims dims; - dims.ptr = new_dims; - dims.len = 1; - - PyObject* reshaped = PyArray_Newshape( - reinterpret_cast(block_arr_.obj()), &dims, NPY_ANYORDER); - RETURN_IF_PYERROR(); - - // ARROW-8801: Here a PyArrayObject is created that is not being managed by - // any OwnedRef object. This object is then put in the resulting object - // with PyDict_SetItemString, which increments the reference count, so a - // memory leak ensues. There are several ways to fix the memory leak but a - // simple one is to put the reshaped 1D block array in this OwnedRefNoGIL - // so it will be correctly decref'd when this class is destructed. - block_arr_.reset(reshaped); - return Status::OK(); - } - - virtual Status GetResultBlock(PyObject** out) { - *out = block_arr_.obj(); - return Status::OK(); - } - - Status CheckNoZeroCopy(const std::string& message) { - if (options_.zero_copy_only) { - return Status::Invalid(message); - } - return Status::OK(); - } - - Status CheckNotZeroCopyOnly(const ChunkedArray& data) { - if (options_.zero_copy_only) { - return Status::Invalid("Needed to copy ", data.num_chunks(), " chunks with ", - data.null_count(), " nulls, but zero_copy_only was True"); - } - return Status::OK(); - } - - virtual Status Allocate() { - return Status::NotImplemented("Override Allocate in subclasses"); - } - - Status AllocateNDArray(int npy_type, int ndim = 2) { - PyAcquireGIL lock; - - PyObject* block_arr = nullptr; - npy_intp block_dims[2] = {0, 0}; - - if (ndim == 2) { - block_dims[0] = num_columns_; - block_dims[1] = num_rows_; - } else { - block_dims[0] = num_rows_; - } - PyArray_Descr* descr = internal::GetSafeNumPyDtype(npy_type); - if (PyDataType_REFCHK(descr)) { - // ARROW-6876: if the array has refcounted items, let Numpy - // own the array memory so as to decref elements on array destruction - block_arr = PyArray_SimpleNewFromDescr(ndim, block_dims, descr); - RETURN_IF_PYERROR(); - } else { - RETURN_NOT_OK( - PyArray_NewFromPool(ndim, block_dims, descr, options_.pool, &block_arr)); - } - - SetBlockData(block_arr); - return Status::OK(); - } - - void SetDatetimeUnit(NPY_DATETIMEUNIT unit) { - PyAcquireGIL lock; - auto date_dtype = reinterpret_cast( - PyArray_DESCR(reinterpret_cast(block_arr_.obj()))->c_metadata); - date_dtype->meta.base = unit; - } - - PandasOptions options_; - - std::mutex allocation_lock_; - - int64_t num_rows_; - int num_columns_; - - OwnedRefNoGIL block_arr_; - uint8_t* block_data_ = nullptr; - - // ndarray - OwnedRefNoGIL placement_arr_; - int64_t* placement_data_ = nullptr; - - private: - ARROW_DISALLOW_COPY_AND_ASSIGN(PandasWriter); -}; - -template -inline void ConvertIntegerWithNulls(const PandasOptions& options, - const ChunkedArray& data, OutType* out_values) { - for (int c = 0; c < data.num_chunks(); c++) { - const auto& arr = *data.chunk(c); - const InType* in_values = GetPrimitiveValues(arr); - // Upcast to double, set NaN as appropriate - - for (int i = 0; i < 
arr.length(); ++i) { - *out_values++ = - arr.IsNull(i) ? static_cast(NAN) : static_cast(in_values[i]); - } - } -} - -template -inline void ConvertIntegerNoNullsSameType(const PandasOptions& options, - const ChunkedArray& data, T* out_values) { - for (int c = 0; c < data.num_chunks(); c++) { - const auto& arr = *data.chunk(c); - if (arr.length() > 0) { - const T* in_values = GetPrimitiveValues(arr); - memcpy(out_values, in_values, sizeof(T) * arr.length()); - out_values += arr.length(); - } - } -} - -template -inline void ConvertIntegerNoNullsCast(const PandasOptions& options, - const ChunkedArray& data, OutType* out_values) { - for (int c = 0; c < data.num_chunks(); c++) { - const auto& arr = *data.chunk(c); - const InType* in_values = GetPrimitiveValues(arr); - for (int64_t i = 0; i < arr.length(); ++i) { - *out_values = in_values[i]; - } - } -} - -template -struct MemoizationTraits { - using Scalar = typename T::c_type; -}; - -template -struct MemoizationTraits> { - // For binary, we memoize string_view as a scalar value to avoid having to - // unnecessarily copy the memory into the memo table data structure - using Scalar = std::string_view; -}; - -// Generic Array -> PyObject** converter that handles object deduplication, if -// requested -template -inline Status ConvertAsPyObjects(const PandasOptions& options, const ChunkedArray& data, - WrapFunction&& wrap_func, PyObject** out_values) { - using ArrayType = typename TypeTraits::ArrayType; - using Scalar = typename MemoizationTraits::Scalar; - - ::arrow::internal::ScalarMemoTable memo_table(options.pool); - std::vector unique_values; - int32_t memo_size = 0; - - auto WrapMemoized = [&](const Scalar& value, PyObject** out_values) { - int32_t memo_index; - RETURN_NOT_OK(memo_table.GetOrInsert(value, &memo_index)); - if (memo_index == memo_size) { - // New entry - RETURN_NOT_OK(wrap_func(value, out_values)); - unique_values.push_back(*out_values); - ++memo_size; - } else { - // Duplicate entry - Py_INCREF(unique_values[memo_index]); - *out_values = unique_values[memo_index]; - } - return Status::OK(); - }; - - auto WrapUnmemoized = [&](const Scalar& value, PyObject** out_values) { - return wrap_func(value, out_values); - }; - - for (int c = 0; c < data.num_chunks(); c++) { - const auto& arr = arrow::internal::checked_cast(*data.chunk(c)); - if (options.deduplicate_objects) { - RETURN_NOT_OK(internal::WriteArrayObjects(arr, WrapMemoized, out_values)); - } else { - RETURN_NOT_OK(internal::WriteArrayObjects(arr, WrapUnmemoized, out_values)); - } - out_values += arr.length(); - } - return Status::OK(); -} - -Status ConvertStruct(PandasOptions options, const ChunkedArray& data, - PyObject** out_values) { - if (data.num_chunks() == 0) { - return Status::OK(); - } - // ChunkedArray has at least one chunk - auto arr = checked_cast(data.chunk(0).get()); - // Use it to cache the struct type and number of fields for all chunks - int32_t num_fields = arr->num_fields(); - auto array_type = arr->type(); - std::vector fields_data(num_fields * data.num_chunks()); - OwnedRef dict_item; - - // See notes in MakeInnerOptions. - options = MakeInnerOptions(std::move(options)); - // Don't blindly convert because timestamps in lists are handled differently. 
- options.timestamp_as_object = true; - - for (int c = 0; c < data.num_chunks(); c++) { - auto fields_data_offset = c * num_fields; - auto arr = checked_cast(data.chunk(c).get()); - // Convert the struct arrays first - for (int32_t i = 0; i < num_fields; i++) { - auto field = arr->field(static_cast(i)); - // In case the field is an extension array, use .storage() to convert to Pandas - if (field->type()->id() == Type::EXTENSION) { - const ExtensionArray& arr_ext = checked_cast(*field); - field = arr_ext.storage(); - } - RETURN_NOT_OK(ConvertArrayToPandas(options, field, nullptr, - fields_data[i + fields_data_offset].ref())); - DCHECK(PyArray_Check(fields_data[i + fields_data_offset].obj())); - } - - // Construct a dictionary for each row - const bool has_nulls = data.null_count() > 0; - for (int64_t i = 0; i < arr->length(); ++i) { - if (has_nulls && arr->IsNull(i)) { - Py_INCREF(Py_None); - *out_values = Py_None; - } else { - // Build the new dict object for the row - dict_item.reset(PyDict_New()); - RETURN_IF_PYERROR(); - for (int32_t field_idx = 0; field_idx < num_fields; ++field_idx) { - OwnedRef field_value; - auto name = array_type->field(static_cast(field_idx))->name(); - if (!arr->field(static_cast(field_idx))->IsNull(i)) { - // Value exists in child array, obtain it - auto array = reinterpret_cast( - fields_data[field_idx + fields_data_offset].obj()); - auto ptr = reinterpret_cast(PyArray_GETPTR1(array, i)); - field_value.reset(PyArray_GETITEM(array, ptr)); - RETURN_IF_PYERROR(); - } else { - // Translate the Null to a None - Py_INCREF(Py_None); - field_value.reset(Py_None); - } - // PyDict_SetItemString increments reference count - auto setitem_result = - PyDict_SetItemString(dict_item.obj(), name.c_str(), field_value.obj()); - RETURN_IF_PYERROR(); - DCHECK_EQ(setitem_result, 0); - } - *out_values = dict_item.obj(); - // Grant ownership to the resulting array - Py_INCREF(*out_values); - } - ++out_values; - } - } - return Status::OK(); -} - -Status DecodeDictionaries(MemoryPool* pool, const std::shared_ptr& dense_type, - ArrayVector* arrays) { - compute::ExecContext ctx(pool); - compute::CastOptions options; - for (size_t i = 0; i < arrays->size(); ++i) { - ARROW_ASSIGN_OR_RAISE((*arrays)[i], - compute::Cast(*(*arrays)[i], dense_type, options, &ctx)); - } - return Status::OK(); -} - -Status DecodeDictionaries(MemoryPool* pool, const std::shared_ptr& dense_type, - std::shared_ptr* array) { - auto chunks = (*array)->chunks(); - RETURN_NOT_OK(DecodeDictionaries(pool, dense_type, &chunks)); - *array = std::make_shared(std::move(chunks), dense_type); - return Status::OK(); -} - -template -Status ConvertListsLike(PandasOptions options, const ChunkedArray& data, - PyObject** out_values) { - // Get column of underlying value arrays - ArrayVector value_arrays; - for (int c = 0; c < data.num_chunks(); c++) { - const auto& arr = checked_cast(*data.chunk(c)); - // values() does not account for offsets, so we need to slice into it. - // We can't use Flatten(), because it removes the values behind a null list - // value, and that makes the offsets into original list values and our - // flattened_values array different. 
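-    // For example, the list array [[1, 2], null, [3]] may store child
-    // values [1, 2, 99, 3] (99 hidden behind the null slot) with offsets
-    // [0, 2, 3, 4]; Flatten() would yield [1, 2, 3], so indexing it with
-    // those offsets would be wrong, while Slice() keeps the two aligned.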
- std::shared_ptr flattened_values = arr.values()->Slice( - arr.value_offset(0), arr.value_offset(arr.length()) - arr.value_offset(0)); - if (arr.value_type()->id() == Type::EXTENSION) { - const auto& arr_ext = checked_cast(*flattened_values); - value_arrays.emplace_back(arr_ext.storage()); - } else { - value_arrays.emplace_back(flattened_values); - } - } - - using ListArrayType = typename ListArrayT::TypeClass; - const auto& list_type = checked_cast(*data.type()); - auto value_type = list_type.value_type(); - if (value_type->id() == Type::EXTENSION) { - value_type = checked_cast(*value_type).storage_type(); - } - - auto flat_column = std::make_shared(value_arrays, value_type); - - options = MakeInnerOptions(std::move(options)); - - OwnedRefNoGIL owned_numpy_array; - RETURN_NOT_OK(ConvertChunkedArrayToPandas(options, flat_column, nullptr, - owned_numpy_array.ref())); - PyObject* numpy_array = owned_numpy_array.obj(); - DCHECK(PyArray_Check(numpy_array)); - - int64_t chunk_offset = 0; - for (int c = 0; c < data.num_chunks(); c++) { - const auto& arr = checked_cast(*data.chunk(c)); - const bool has_nulls = data.null_count() > 0; - for (int64_t i = 0; i < arr.length(); ++i) { - if (has_nulls && arr.IsNull(i)) { - Py_INCREF(Py_None); - *out_values = Py_None; - } else { - // Need to subtract value_offset(0) since the original chunk might be a slice - // into another array. - OwnedRef start(PyLong_FromLongLong(arr.value_offset(i) + chunk_offset - - arr.value_offset(0))); - OwnedRef end(PyLong_FromLongLong(arr.value_offset(i + 1) + chunk_offset - - arr.value_offset(0))); - OwnedRef slice(PySlice_New(start.obj(), end.obj(), nullptr)); - - if (ARROW_PREDICT_FALSE(slice.obj() == nullptr)) { - // Fall out of loop, will return from RETURN_IF_PYERROR - break; - } - *out_values = PyObject_GetItem(numpy_array, slice.obj()); - - if (*out_values == nullptr) { - // Fall out of loop, will return from RETURN_IF_PYERROR - break; - } - } - ++out_values; - } - RETURN_IF_PYERROR(); - - chunk_offset += arr.value_offset(arr.length()) - arr.value_offset(0); - } - - return Status::OK(); -} - -template -Status ConvertMapHelper(F1 resetRow, F2 addPairToRow, F3 stealRow, - const ChunkedArray& data, PyArrayObject* py_keys, - PyArrayObject* py_items, - // needed for null checks in items - const std::vector> item_arrays, - PyObject** out_values) { - OwnedRef key_value; - OwnedRef item_value; - - int64_t chunk_offset = 0; - for (int c = 0; c < data.num_chunks(); ++c) { - const auto& arr = checked_cast(*data.chunk(c)); - const bool has_nulls = data.null_count() > 0; - - // Make a list of key/item pairs for each row in array - for (int64_t i = 0; i < arr.length(); ++i) { - if (has_nulls && arr.IsNull(i)) { - Py_INCREF(Py_None); - *out_values = Py_None; - } else { - int64_t entry_offset = arr.value_offset(i); - int64_t num_pairs = arr.value_offset(i + 1) - entry_offset; - - // Build the new list object for the row of Python pairs - RETURN_NOT_OK(resetRow(num_pairs)); - - // Add each key/item pair in the row - for (int64_t j = 0; j < num_pairs; ++j) { - // Get key value, key is non-nullable for a valid row - auto ptr_key = reinterpret_cast( - PyArray_GETPTR1(py_keys, chunk_offset + entry_offset + j)); - key_value.reset(PyArray_GETITEM(py_keys, ptr_key)); - RETURN_IF_PYERROR(); - - if (item_arrays[c]->IsNull(entry_offset + j)) { - // Translate the Null to a None - Py_INCREF(Py_None); - item_value.reset(Py_None); - } else { - // Get valid value from item array - auto ptr_item = reinterpret_cast( - PyArray_GETPTR1(py_items, 
chunk_offset + entry_offset + j)); - item_value.reset(PyArray_GETITEM(py_items, ptr_item)); - RETURN_IF_PYERROR(); - } - - // Add the key/item pair to the row - RETURN_NOT_OK(addPairToRow(j, key_value, item_value)); - } - - // Pass ownership to the resulting array - *out_values = stealRow(); - } - ++out_values; - } - RETURN_IF_PYERROR(); - - chunk_offset += arr.values()->length(); - } - - return Status::OK(); -} - -// A more helpful error message around TypeErrors that may stem from unhashable keys -Status CheckMapAsPydictsTypeError() { - if (ARROW_PREDICT_TRUE(!PyErr_Occurred())) { - return Status::OK(); - } - if (PyErr_ExceptionMatches(PyExc_TypeError)) { - // Modify the error string directly, so it is re-raised - // with our additional info. - // - // There are not many interesting things happening when this - // is hit. This is intended to only be called directly after - // PyDict_SetItem, where a finite set of errors could occur. - PyObject *type, *value, *traceback; - PyErr_Fetch(&type, &value, &traceback); - std::string message; - RETURN_NOT_OK(internal::PyObject_StdStringStr(value, &message)); - message += - ". If keys are not hashable, then you must use the option " - "[maps_as_pydicts=None (default)]"; - - // resets the error - PyErr_SetString(PyExc_TypeError, message.c_str()); - } - return ConvertPyError(); -} - -Status CheckForDuplicateKeys(bool error_on_duplicate_keys, Py_ssize_t total_dict_len, - Py_ssize_t total_raw_len) { - if (total_dict_len < total_raw_len) { - const char* message = - "[maps_as_pydicts] " - "After conversion of Arrow maps to pydicts, " - "detected data loss due to duplicate keys. " - "Original input length is [%lld], total converted pydict length is [%lld]."; - std::array buf; - std::snprintf(buf.data(), buf.size(), message, total_raw_len, total_dict_len); - - if (error_on_duplicate_keys) { - return Status::UnknownError(buf.data()); - } else { - ARROW_LOG(WARNING) << buf.data(); - } - } - return Status::OK(); -} - -Status ConvertMap(PandasOptions options, const ChunkedArray& data, - PyObject** out_values) { - // Get columns of underlying key/item arrays - std::vector> key_arrays; - std::vector> item_arrays; - for (int c = 0; c < data.num_chunks(); ++c) { - const auto& map_arr = checked_cast(*data.chunk(c)); - key_arrays.emplace_back(map_arr.keys()); - item_arrays.emplace_back(map_arr.items()); - } - - const auto& map_type = checked_cast(*data.type()); - auto key_type = map_type.key_type(); - auto item_type = map_type.item_type(); - - // ARROW-6899: Convert dictionary-encoded children to dense instead of - // failing below. A more efficient conversion than this could be done later - if (key_type->id() == Type::DICTIONARY) { - auto dense_type = checked_cast(*key_type).value_type(); - RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &key_arrays)); - key_type = dense_type; - } - if (item_type->id() == Type::DICTIONARY) { - auto dense_type = checked_cast(*item_type).value_type(); - RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &item_arrays)); - item_type = dense_type; - } - - // See notes in MakeInnerOptions. - options = MakeInnerOptions(std::move(options)); - // Don't blindly convert because timestamps in lists are handled differently. 
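-  // (The same caveat as for structs and lists applies to map keys and items:
-  // nested timestamps are emitted as Python datetime objects.)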
- options.timestamp_as_object = true; - - auto flat_keys = std::make_shared(key_arrays, key_type); - auto flat_items = std::make_shared(item_arrays, item_type); - OwnedRefNoGIL owned_numpy_keys; - RETURN_NOT_OK( - ConvertChunkedArrayToPandas(options, flat_keys, nullptr, owned_numpy_keys.ref())); - OwnedRefNoGIL owned_numpy_items; - RETURN_NOT_OK( - ConvertChunkedArrayToPandas(options, flat_items, nullptr, owned_numpy_items.ref())); - PyArrayObject* py_keys = reinterpret_cast(owned_numpy_keys.obj()); - PyArrayObject* py_items = reinterpret_cast(owned_numpy_items.obj()); - - if (options.maps_as_pydicts == MapConversionType::DEFAULT) { - // The default behavior to express an Arrow MAP as a list of [(key, value), ...] pairs - OwnedRef list_item; - return ConvertMapHelper( - [&list_item](int64_t num_pairs) { - list_item.reset(PyList_New(num_pairs)); - return CheckPyError(); - }, - [&list_item](int64_t idx, OwnedRef& key_value, OwnedRef& item_value) { - PyList_SET_ITEM(list_item.obj(), idx, - PyTuple_Pack(2, key_value.obj(), item_value.obj())); - return CheckPyError(); - }, - [&list_item] { return list_item.detach(); }, data, py_keys, py_items, item_arrays, - out_values); - } else { - // Use a native pydict - OwnedRef dict_item; - Py_ssize_t total_dict_len{0}; - Py_ssize_t total_raw_len{0}; - - bool error_on_duplicate_keys; - if (options.maps_as_pydicts == MapConversionType::LOSSY) { - error_on_duplicate_keys = false; - } else if (options.maps_as_pydicts == MapConversionType::STRICT_) { - error_on_duplicate_keys = true; - } else { - auto val = std::underlying_type_t(options.maps_as_pydicts); - return Status::UnknownError("Received unknown option for maps_as_pydicts: " + - std::to_string(val)); - } - - auto status = ConvertMapHelper( - [&dict_item, &total_raw_len](int64_t num_pairs) { - total_raw_len += num_pairs; - dict_item.reset(PyDict_New()); - return CheckPyError(); - }, - [&dict_item]([[maybe_unused]] int64_t idx, OwnedRef& key_value, - OwnedRef& item_value) { - auto setitem_result = - PyDict_SetItem(dict_item.obj(), key_value.obj(), item_value.obj()); - ARROW_RETURN_NOT_OK(CheckMapAsPydictsTypeError()); - // returns -1 if there are internal errors around hashing/resizing - return setitem_result == 0 ? Status::OK() - : Status::UnknownError( - "[maps_as_pydicts] " - "Unexpected failure inserting Arrow (key, " - "value) pair into Python dict"); - }, - [&dict_item, &total_dict_len] { - total_dict_len += PyDict_Size(dict_item.obj()); - return dict_item.detach(); - }, - data, py_keys, py_items, item_arrays, out_values); - - ARROW_RETURN_NOT_OK(status); - // If there were no errors generating the pydicts, - // then check if we detected any data loss from duplicate keys. - return CheckForDuplicateKeys(error_on_duplicate_keys, total_dict_len, total_raw_len); - } -} - -template -inline void ConvertNumericNullable(const ChunkedArray& data, InType na_value, - OutType* out_values) { - for (int c = 0; c < data.num_chunks(); c++) { - const auto& arr = *data.chunk(c); - const InType* in_values = GetPrimitiveValues(arr); - - if (arr.null_count() > 0) { - for (int64_t i = 0; i < arr.length(); ++i) { - *out_values++ = arr.IsNull(i) ? 
na_value : in_values[i]; - } - } else { - memcpy(out_values, in_values, sizeof(InType) * arr.length()); - out_values += arr.length(); - } - } -} - -template -inline void ConvertNumericNullableCast(const ChunkedArray& data, InType na_value, - OutType* out_values) { - for (int c = 0; c < data.num_chunks(); c++) { - const auto& arr = *data.chunk(c); - const InType* in_values = GetPrimitiveValues(arr); - - for (int64_t i = 0; i < arr.length(); ++i) { - *out_values++ = arr.IsNull(i) ? static_cast(na_value) - : static_cast(in_values[i]); - } - } -} - -template -class TypedPandasWriter : public PandasWriter { - public: - using T = typename npy_traits::value_type; - - using PandasWriter::PandasWriter; - - Status TransferSingle(std::shared_ptr data, PyObject* py_ref) override { - if (CanZeroCopy(*data)) { - PyObject* wrapped; - npy_intp dims[2] = {static_cast(num_columns_), - static_cast(num_rows_)}; - RETURN_NOT_OK( - MakeNumPyView(data->chunk(0), py_ref, NPY_TYPE, /*ndim=*/2, dims, &wrapped)); - SetBlockData(wrapped); - return Status::OK(); - } else { - RETURN_NOT_OK(CheckNotZeroCopyOnly(*data)); - RETURN_NOT_OK(EnsureAllocated()); - return CopyInto(data, /*rel_placement=*/0); - } - } - - Status CheckTypeExact(const DataType& type, Type::type expected) { - if (type.id() != expected) { - // TODO(wesm): stringify NumPy / pandas type - return Status::NotImplemented("Cannot write Arrow data of type ", type.ToString()); - } - return Status::OK(); - } - - T* GetBlockColumnStart(int64_t rel_placement) { - return reinterpret_cast(block_data_) + rel_placement * num_rows_; - } - - protected: - Status Allocate() override { return AllocateNDArray(NPY_TYPE); } -}; - -struct ObjectWriterVisitor { - const PandasOptions& options; - const ChunkedArray& data; - PyObject** out_values; - - Status Visit(const NullType& type) { - for (int c = 0; c < data.num_chunks(); c++) { - std::shared_ptr arr = data.chunk(c); - - for (int64_t i = 0; i < arr->length(); ++i) { - // All values are null - Py_INCREF(Py_None); - *out_values = Py_None; - ++out_values; - } - } - return Status::OK(); - } - - Status Visit(const BooleanType& type) { - for (int c = 0; c < data.num_chunks(); c++) { - const auto& arr = checked_cast(*data.chunk(c)); - - for (int64_t i = 0; i < arr.length(); ++i) { - if (arr.IsNull(i)) { - Py_INCREF(Py_None); - *out_values++ = Py_None; - } else if (arr.Value(i)) { - // True - Py_INCREF(Py_True); - *out_values++ = Py_True; - } else { - // False - Py_INCREF(Py_False); - *out_values++ = Py_False; - } - } - } - return Status::OK(); - } - - template - enable_if_integer Visit(const Type& type) { - using T = typename Type::c_type; - auto WrapValue = [](T value, PyObject** out) { - *out = std::is_signed::value ? 
PyLong_FromLongLong(value) - : PyLong_FromUnsignedLongLong(value); - RETURN_IF_PYERROR(); - return Status::OK(); - }; - return ConvertAsPyObjects(options, data, WrapValue, out_values); - } - - template - enable_if_t::value || is_fixed_size_binary_type::value, - Status> - Visit(const Type& type) { - auto WrapValue = [](const std::string_view& view, PyObject** out) { - *out = WrapBytes::Wrap(view.data(), view.length()); - if (*out == nullptr) { - PyErr_Clear(); - return Status::UnknownError("Wrapping ", view, " failed"); - } - return Status::OK(); - }; - return ConvertAsPyObjects(options, data, WrapValue, out_values); - } - - template - enable_if_date Visit(const Type& type) { - auto WrapValue = [](typename Type::c_type value, PyObject** out) { - RETURN_NOT_OK(internal::PyDate_from_int(value, Type::UNIT, out)); - RETURN_IF_PYERROR(); - return Status::OK(); - }; - return ConvertAsPyObjects(options, data, WrapValue, out_values); - } - - template - enable_if_time Visit(const Type& type) { - const TimeUnit::type unit = type.unit(); - auto WrapValue = [unit](typename Type::c_type value, PyObject** out) { - RETURN_NOT_OK(internal::PyTime_from_int(value, unit, out)); - RETURN_IF_PYERROR(); - return Status::OK(); - }; - return ConvertAsPyObjects(options, data, WrapValue, out_values); - } - - template - enable_if_timestamp Visit(const Type& type) { - const TimeUnit::type unit = type.unit(); - OwnedRef tzinfo; - - auto ConvertTimezoneNaive = [&](typename Type::c_type value, PyObject** out) { - RETURN_NOT_OK(internal::PyDateTime_from_int(value, unit, out)); - RETURN_IF_PYERROR(); - return Status::OK(); - }; - auto ConvertTimezoneAware = [&](typename Type::c_type value, PyObject** out) { - PyObject* naive_datetime; - RETURN_NOT_OK(ConvertTimezoneNaive(value, &naive_datetime)); - - // convert the timezone naive datetime object to timezone aware - // two step conversion of the datetime mimics Python's code: - // dt.replace(tzinfo=datetime.timezone.utc).astimezone(tzinfo) - // first step: replacing timezone with timezone.utc (replace method) - OwnedRef args(PyTuple_New(0)); - OwnedRef keywords(PyDict_New()); - PyDict_SetItemString(keywords.obj(), "tzinfo", PyDateTime_TimeZone_UTC); - OwnedRef naive_datetime_replace(PyObject_GetAttrString(naive_datetime, "replace")); - OwnedRef datetime_utc( - PyObject_Call(naive_datetime_replace.obj(), args.obj(), keywords.obj())); - // second step: adjust the datetime to tzinfo timezone (astimezone method) - *out = PyObject_CallMethod(datetime_utc.obj(), "astimezone", "O", tzinfo.obj()); - - // the timezone naive object is no longer required - Py_DECREF(naive_datetime); - RETURN_IF_PYERROR(); - - return Status::OK(); - }; - - if (!type.timezone().empty() && !options.ignore_timezone) { - // convert timezone aware - PyObject* tzobj; - ARROW_ASSIGN_OR_RAISE(tzobj, internal::StringToTzinfo(type.timezone())); - tzinfo.reset(tzobj); - RETURN_IF_PYERROR(); - RETURN_NOT_OK( - ConvertAsPyObjects(options, data, ConvertTimezoneAware, out_values)); - } else { - // convert timezone naive - RETURN_NOT_OK( - ConvertAsPyObjects(options, data, ConvertTimezoneNaive, out_values)); - } - - return Status::OK(); - } - - template - enable_if_t::value, Status> Visit( - const Type& type) { - OwnedRef args(PyTuple_New(0)); - OwnedRef kwargs(PyDict_New()); - RETURN_IF_PYERROR(); - auto to_date_offset = [&](const MonthDayNanoIntervalType::MonthDayNanos& interval, - PyObject** out) { - DCHECK(internal::BorrowPandasDataOffsetType() != nullptr); - // DateOffset objects do not add nanoseconds component 
to pd.Timestamp. - // as of Pandas 1.3.3 - // (https://github.com/pandas-dev/pandas/issues/43892). - // So convert microseconds and remainder to preserve data - // but give users more expected results. - int64_t microseconds = interval.nanoseconds / 1000; - int64_t nanoseconds; - if (interval.nanoseconds >= 0) { - nanoseconds = interval.nanoseconds % 1000; - } else { - nanoseconds = -((-interval.nanoseconds) % 1000); - } - - PyDict_SetItemString(kwargs.obj(), "months", PyLong_FromLong(interval.months)); - PyDict_SetItemString(kwargs.obj(), "days", PyLong_FromLong(interval.days)); - PyDict_SetItemString(kwargs.obj(), "microseconds", - PyLong_FromLongLong(microseconds)); - PyDict_SetItemString(kwargs.obj(), "nanoseconds", PyLong_FromLongLong(nanoseconds)); - *out = - PyObject_Call(internal::BorrowPandasDataOffsetType(), args.obj(), kwargs.obj()); - RETURN_IF_PYERROR(); - return Status::OK(); - }; - return ConvertAsPyObjects(options, data, to_date_offset, - out_values); - } - - Status Visit(const Decimal128Type& type) { - OwnedRef decimal; - OwnedRef Decimal; - RETURN_NOT_OK(internal::ImportModule("decimal", &decimal)); - RETURN_NOT_OK(internal::ImportFromModule(decimal.obj(), "Decimal", &Decimal)); - PyObject* decimal_constructor = Decimal.obj(); - - for (int c = 0; c < data.num_chunks(); c++) { - const auto& arr = checked_cast(*data.chunk(c)); - - for (int64_t i = 0; i < arr.length(); ++i) { - if (arr.IsNull(i)) { - Py_INCREF(Py_None); - *out_values++ = Py_None; - } else { - *out_values++ = - internal::DecimalFromString(decimal_constructor, arr.FormatValue(i)); - RETURN_IF_PYERROR(); - } - } - } - - return Status::OK(); - } - - Status Visit(const Decimal256Type& type) { - OwnedRef decimal; - OwnedRef Decimal; - RETURN_NOT_OK(internal::ImportModule("decimal", &decimal)); - RETURN_NOT_OK(internal::ImportFromModule(decimal.obj(), "Decimal", &Decimal)); - PyObject* decimal_constructor = Decimal.obj(); - - for (int c = 0; c < data.num_chunks(); c++) { - const auto& arr = checked_cast(*data.chunk(c)); - - for (int64_t i = 0; i < arr.length(); ++i) { - if (arr.IsNull(i)) { - Py_INCREF(Py_None); - *out_values++ = Py_None; - } else { - *out_values++ = - internal::DecimalFromString(decimal_constructor, arr.FormatValue(i)); - RETURN_IF_PYERROR(); - } - } - } - - return Status::OK(); - } - - template - enable_if_t::value || is_var_length_list_type::value, - Status> - Visit(const T& type) { - using ArrayType = typename TypeTraits::ArrayType; - if (!ListTypeSupported(*type.value_type())) { - return Status::NotImplemented( - "Not implemented type for conversion from List to Pandas: ", - type.value_type()->ToString()); - } - return ConvertListsLike(options, data, out_values); - } - - Status Visit(const MapType& type) { return ConvertMap(options, data, out_values); } - - Status Visit(const StructType& type) { - return ConvertStruct(options, data, out_values); - } - - template - enable_if_t::value || - std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value || - (std::is_base_of::value && - !std::is_same::value) || - std::is_base_of::value, - Status> - Visit(const Type& type) { - return Status::NotImplemented("No implemented conversion to object dtype: ", - type.ToString()); - } -}; - -class ObjectWriter : public TypedPandasWriter { - public: - using TypedPandasWriter::TypedPandasWriter; - Status CopyInto(std::shared_ptr data, int64_t rel_placement) override { - PyAcquireGIL lock; - ObjectWriterVisitor visitor{this->options_, *data, - 
this->GetBlockColumnStart(rel_placement)}; - return VisitTypeInline(*data->type(), &visitor); - } -}; - -static inline bool IsNonNullContiguous(const ChunkedArray& data) { - return data.num_chunks() == 1 && data.null_count() == 0; -} - -template -class IntWriter : public TypedPandasWriter { - public: - using ArrowType = typename npy_traits::TypeClass; - using TypedPandasWriter::TypedPandasWriter; - - bool CanZeroCopy(const ChunkedArray& data) const override { - return IsNonNullContiguous(data); - } - - Status CopyInto(std::shared_ptr data, int64_t rel_placement) override { - RETURN_NOT_OK(this->CheckTypeExact(*data->type(), ArrowType::type_id)); - ConvertIntegerNoNullsSameType( - this->options_, *data, this->GetBlockColumnStart(rel_placement)); - return Status::OK(); - } -}; - -template -class FloatWriter : public TypedPandasWriter { - public: - using ArrowType = typename npy_traits::TypeClass; - using TypedPandasWriter::TypedPandasWriter; - using T = typename ArrowType::c_type; - - bool CanZeroCopy(const ChunkedArray& data) const override { - return IsNonNullContiguous(data) && data.type()->id() == ArrowType::type_id; - } - - Status CopyInto(std::shared_ptr data, int64_t rel_placement) override { - Type::type in_type = data->type()->id(); - auto out_values = this->GetBlockColumnStart(rel_placement); - -#define INTEGER_CASE(IN_TYPE) \ - ConvertIntegerWithNulls(this->options_, *data, out_values); \ - break; - - switch (in_type) { - case Type::UINT8: - INTEGER_CASE(uint8_t); - case Type::INT8: - INTEGER_CASE(int8_t); - case Type::UINT16: - INTEGER_CASE(uint16_t); - case Type::INT16: - INTEGER_CASE(int16_t); - case Type::UINT32: - INTEGER_CASE(uint32_t); - case Type::INT32: - INTEGER_CASE(int32_t); - case Type::UINT64: - INTEGER_CASE(uint64_t); - case Type::INT64: - INTEGER_CASE(int64_t); - case Type::HALF_FLOAT: - ConvertNumericNullableCast(*data, npy_traits::na_sentinel, out_values); - case Type::FLOAT: - ConvertNumericNullableCast(*data, npy_traits::na_sentinel, out_values); - break; - case Type::DOUBLE: - ConvertNumericNullableCast(*data, npy_traits::na_sentinel, out_values); - break; - default: - return Status::NotImplemented("Cannot write Arrow data of type ", - data->type()->ToString(), - " to a Pandas floating point block"); - } - -#undef INTEGER_CASE - - return Status::OK(); - } -}; - -using UInt8Writer = IntWriter; -using Int8Writer = IntWriter; -using UInt16Writer = IntWriter; -using Int16Writer = IntWriter; -using UInt32Writer = IntWriter; -using Int32Writer = IntWriter; -using UInt64Writer = IntWriter; -using Int64Writer = IntWriter; -using Float16Writer = FloatWriter; -using Float32Writer = FloatWriter; -using Float64Writer = FloatWriter; - -class BoolWriter : public TypedPandasWriter { - public: - using TypedPandasWriter::TypedPandasWriter; - - Status TransferSingle(std::shared_ptr data, PyObject* py_ref) override { - RETURN_NOT_OK( - CheckNoZeroCopy("Zero copy conversions not possible with " - "boolean types")); - RETURN_NOT_OK(EnsureAllocated()); - return CopyInto(data, /*rel_placement=*/0); - } - - Status CopyInto(std::shared_ptr data, int64_t rel_placement) override { - RETURN_NOT_OK(this->CheckTypeExact(*data->type(), Type::BOOL)); - auto out_values = this->GetBlockColumnStart(rel_placement); - for (int c = 0; c < data->num_chunks(); c++) { - const auto& arr = checked_cast(*data->chunk(c)); - for (int64_t i = 0; i < arr.length(); ++i) { - *out_values++ = static_cast(arr.Value(i)); - } - } - return Status::OK(); - } -}; - -// 
---------------------------------------------------------------------- -// Date / timestamp types - -template -inline void ConvertDatetime(const ChunkedArray& data, int64_t* out_values) { - for (int c = 0; c < data.num_chunks(); c++) { - const auto& arr = *data.chunk(c); - const T* in_values = GetPrimitiveValues(arr); - - for (int64_t i = 0; i < arr.length(); ++i) { - *out_values++ = arr.IsNull(i) ? kPandasTimestampNull - : (static_cast(in_values[i]) * SHIFT); - } - } -} - -template -void ConvertDatesShift(const ChunkedArray& data, int64_t* out_values) { - for (int c = 0; c < data.num_chunks(); c++) { - const auto& arr = *data.chunk(c); - const T* in_values = GetPrimitiveValues(arr); - for (int64_t i = 0; i < arr.length(); ++i) { - *out_values++ = arr.IsNull(i) ? kPandasTimestampNull - : static_cast(in_values[i]) / SHIFT; - } - } -} - -class DatetimeDayWriter : public TypedPandasWriter { - public: - using TypedPandasWriter::TypedPandasWriter; - - Status CopyInto(std::shared_ptr data, int64_t rel_placement) override { - int64_t* out_values = this->GetBlockColumnStart(rel_placement); - const auto& type = checked_cast(*data->type()); - switch (type.unit()) { - case DateUnit::DAY: - ConvertDatesShift(*data, out_values); - break; - case DateUnit::MILLI: - ConvertDatesShift(*data, out_values); - break; - } - return Status::OK(); - } - - protected: - Status Allocate() override { - RETURN_NOT_OK(this->AllocateNDArray(NPY_DATETIME)); - SetDatetimeUnit(NPY_FR_D); - return Status::OK(); - } -}; - -template -class DatetimeWriter : public TypedPandasWriter { - public: - using TypedPandasWriter::TypedPandasWriter; - - bool CanZeroCopy(const ChunkedArray& data) const override { - if (data.type()->id() == Type::TIMESTAMP) { - const auto& type = checked_cast(*data.type()); - return IsNonNullContiguous(data) && type.unit() == UNIT; - } else { - return false; - } - } - - Status CopyInto(std::shared_ptr data, int64_t rel_placement) override { - const auto& ts_type = checked_cast(*data->type()); - DCHECK_EQ(UNIT, ts_type.unit()) << "Should only call instances of this writer " - << "with arrays of the correct unit"; - ConvertNumericNullable(*data, kPandasTimestampNull, - this->GetBlockColumnStart(rel_placement)); - return Status::OK(); - } - - protected: - Status Allocate() override { - RETURN_NOT_OK(this->AllocateNDArray(NPY_DATETIME)); - SetDatetimeUnit(internal::NumPyFrequency(UNIT)); - return Status::OK(); - } -}; - -using DatetimeSecondWriter = DatetimeWriter; - -class DatetimeMilliWriter : public DatetimeWriter { - public: - using DatetimeWriter::DatetimeWriter; - - Status CopyInto(std::shared_ptr data, int64_t rel_placement) override { - Type::type type = data->type()->id(); - int64_t* out_values = this->GetBlockColumnStart(rel_placement); - if (type == Type::DATE32) { - // Convert from days since epoch to datetime64[ms] - ConvertDatetime(*data, out_values); - } else if (type == Type::DATE64) { - ConvertNumericNullable(*data, kPandasTimestampNull, out_values); - } else { - const auto& ts_type = checked_cast(*data->type()); - DCHECK_EQ(TimeUnit::MILLI, ts_type.unit()) - << "Should only call instances of this writer " - << "with arrays of the correct unit"; - ConvertNumericNullable(*data, kPandasTimestampNull, out_values); - } - return Status::OK(); - } -}; - -using DatetimeMicroWriter = DatetimeWriter; - -class DatetimeNanoWriter : public DatetimeWriter { - public: - using DatetimeWriter::DatetimeWriter; - - Status CopyInto(std::shared_ptr data, int64_t rel_placement) override { - Type::type type = 
data->type()->id(); - int64_t* out_values = this->GetBlockColumnStart(rel_placement); - compute::ExecContext ctx(options_.pool); - compute::CastOptions options; - if (options_.safe_cast) { - options = compute::CastOptions::Safe(); - } else { - options = compute::CastOptions::Unsafe(); - } - Datum out; - auto target_type = timestamp(TimeUnit::NANO); - - if (type == Type::DATE32) { - // Convert from days since epoch to datetime64[ns] - ConvertDatetime(*data, out_values); - } else if (type == Type::DATE64) { - // Date64Type is millisecond timestamp stored as int64_t - // TODO(wesm): Do we want to make sure to zero out the milliseconds? - ConvertDatetime(*data, out_values); - } else if (type == Type::TIMESTAMP) { - const auto& ts_type = checked_cast(*data->type()); - - if (ts_type.unit() == TimeUnit::NANO) { - ConvertNumericNullable(*data, kPandasTimestampNull, out_values); - } else if (ts_type.unit() == TimeUnit::MICRO || ts_type.unit() == TimeUnit::MILLI || - ts_type.unit() == TimeUnit::SECOND) { - ARROW_ASSIGN_OR_RAISE(out, compute::Cast(data, target_type, options, &ctx)); - ConvertNumericNullable(*out.chunked_array(), kPandasTimestampNull, - out_values); - } else { - return Status::NotImplemented("Unsupported time unit"); - } - } else { - return Status::NotImplemented("Cannot write Arrow data of type ", - data->type()->ToString(), - " to a Pandas datetime block."); - } - return Status::OK(); - } -}; - -template -class DatetimeTZWriter : public BASE { - public: - DatetimeTZWriter(const PandasOptions& options, const std::string& timezone, - int64_t num_rows) - : BASE(options, num_rows, 1), timezone_(timezone) {} - - protected: - Status GetResultBlock(PyObject** out) override { - RETURN_NOT_OK(this->MakeBlock1D()); - *out = this->block_arr_.obj(); - return Status::OK(); - } - - Status AddResultMetadata(PyObject* result) override { - PyObject* py_tz = PyUnicode_FromStringAndSize( - timezone_.c_str(), static_cast(timezone_.size())); - RETURN_IF_PYERROR(); - PyDict_SetItemString(result, "timezone", py_tz); - Py_DECREF(py_tz); - return Status::OK(); - } - - private: - std::string timezone_; -}; - -using DatetimeSecondTZWriter = DatetimeTZWriter; -using DatetimeMilliTZWriter = DatetimeTZWriter; -using DatetimeMicroTZWriter = DatetimeTZWriter; -using DatetimeNanoTZWriter = DatetimeTZWriter; - -template -class TimedeltaWriter : public TypedPandasWriter { - public: - using TypedPandasWriter::TypedPandasWriter; - - Status AllocateTimedelta(int ndim) { - RETURN_NOT_OK(this->AllocateNDArray(NPY_TIMEDELTA, ndim)); - SetDatetimeUnit(internal::NumPyFrequency(UNIT)); - return Status::OK(); - } - - bool CanZeroCopy(const ChunkedArray& data) const override { - const auto& type = checked_cast(*data.type()); - return IsNonNullContiguous(data) && type.unit() == UNIT; - } - - Status CopyInto(std::shared_ptr data, int64_t rel_placement) override { - const auto& type = checked_cast(*data->type()); - DCHECK_EQ(UNIT, type.unit()) << "Should only call instances of this writer " - << "with arrays of the correct unit"; - ConvertNumericNullable(*data, kPandasTimestampNull, - this->GetBlockColumnStart(rel_placement)); - return Status::OK(); - } - - protected: - Status Allocate() override { return AllocateTimedelta(2); } -}; - -using TimedeltaSecondWriter = TimedeltaWriter; -using TimedeltaMilliWriter = TimedeltaWriter; -using TimedeltaMicroWriter = TimedeltaWriter; - -class TimedeltaNanoWriter : public TimedeltaWriter { - public: - using TimedeltaWriter::TimedeltaWriter; - - Status CopyInto(std::shared_ptr data, int64_t 
rel_placement) override { - Type::type type = data->type()->id(); - int64_t* out_values = this->GetBlockColumnStart(rel_placement); - if (type == Type::DURATION) { - const auto& ts_type = checked_cast(*data->type()); - if (ts_type.unit() == TimeUnit::NANO) { - ConvertNumericNullable(*data, kPandasTimestampNull, out_values); - } else if (ts_type.unit() == TimeUnit::MICRO) { - ConvertDatetime(*data, out_values); - } else if (ts_type.unit() == TimeUnit::MILLI) { - ConvertDatetime(*data, out_values); - } else if (ts_type.unit() == TimeUnit::SECOND) { - ConvertDatetime(*data, out_values); - } else { - return Status::NotImplemented("Unsupported time unit"); - } - } else { - return Status::NotImplemented("Cannot write Arrow data of type ", - data->type()->ToString(), - " to a Pandas timedelta block."); - } - return Status::OK(); - } -}; - -Status MakeZeroLengthArray(const std::shared_ptr& type, - std::shared_ptr* out) { - std::unique_ptr builder; - RETURN_NOT_OK(MakeBuilder(default_memory_pool(), type, &builder)); - RETURN_NOT_OK(builder->Resize(0)); - return builder->Finish(out); -} - -bool NeedDictionaryUnification(const ChunkedArray& data) { - if (data.num_chunks() < 2) { - return false; - } - const auto& arr_first = checked_cast(*data.chunk(0)); - for (int c = 1; c < data.num_chunks(); c++) { - const auto& arr = checked_cast(*data.chunk(c)); - if (!(arr_first.dictionary()->Equals(arr.dictionary()))) { - return true; - } - } - return false; -} - -template -class CategoricalWriter - : public TypedPandasWriter::npy_type> { - public: - using TRAITS = arrow_traits; - using ArrayType = typename TypeTraits::ArrayType; - using T = typename TRAITS::T; - - explicit CategoricalWriter(const PandasOptions& options, int64_t num_rows) - : TypedPandasWriter(options, num_rows, 1), - ordered_(false), - needs_copy_(false) {} - - Status CopyInto(std::shared_ptr data, int64_t rel_placement) override { - return Status::NotImplemented("categorical type"); - } - - Status TransferSingle(std::shared_ptr data, PyObject* py_ref) override { - const auto& dict_type = checked_cast(*data->type()); - std::shared_ptr dict; - if (data->num_chunks() == 0) { - // no dictionary values => create empty array - RETURN_NOT_OK(this->AllocateNDArray(TRAITS::npy_type, 1)); - RETURN_NOT_OK(MakeZeroLengthArray(dict_type.value_type(), &dict)); - } else { - DCHECK_EQ(IndexType::type_id, dict_type.index_type()->id()); - RETURN_NOT_OK(WriteIndices(*data, &dict)); - } - - PyObject* pydict; - RETURN_NOT_OK(ConvertArrayToPandas(this->options_, dict, nullptr, &pydict)); - dictionary_.reset(pydict); - ordered_ = dict_type.ordered(); - return Status::OK(); - } - - Status Write(std::shared_ptr data, int64_t abs_placement, - int64_t rel_placement) override { - RETURN_NOT_OK(this->EnsurePlacementAllocated()); - RETURN_NOT_OK(TransferSingle(data, /*py_ref=*/nullptr)); - this->placement_data_[rel_placement] = abs_placement; - return Status::OK(); - } - - Status GetSeriesResult(PyObject** out) override { - PyAcquireGIL lock; - - PyObject* result = PyDict_New(); - RETURN_IF_PYERROR(); - - // Expected single array dictionary layout - PyDict_SetItemString(result, "indices", this->block_arr_.obj()); - RETURN_IF_PYERROR(); - RETURN_NOT_OK(AddResultMetadata(result)); - - *out = result; - return Status::OK(); - } - - protected: - Status AddResultMetadata(PyObject* result) override { - PyDict_SetItemString(result, "dictionary", dictionary_.obj()); - PyObject* py_ordered = ordered_ ? 
Py_True : Py_False; - Py_INCREF(py_ordered); - PyDict_SetItemString(result, "ordered", py_ordered); - return Status::OK(); - } - - Status WriteIndicesUniform(const ChunkedArray& data) { - RETURN_NOT_OK(this->AllocateNDArray(TRAITS::npy_type, 1)); - T* out_values = reinterpret_cast(this->block_data_); - - for (int c = 0; c < data.num_chunks(); c++) { - const auto& arr = checked_cast(*data.chunk(c)); - const auto& indices = checked_cast(*arr.indices()); - auto values = reinterpret_cast(indices.raw_values()); - - RETURN_NOT_OK(CheckIndexBounds(*indices.data(), arr.dictionary()->length())); - // Null is -1 in CategoricalBlock - for (int i = 0; i < arr.length(); ++i) { - if (indices.IsValid(i)) { - *out_values++ = values[i]; - } else { - *out_values++ = -1; - } - } - } - return Status::OK(); - } - - Status WriteIndicesVarying(const ChunkedArray& data, std::shared_ptr* out_dict) { - // Yield int32 indices to allow for dictionary outgrowing the current index - // type - RETURN_NOT_OK(this->AllocateNDArray(NPY_INT32, 1)); - auto out_values = reinterpret_cast(this->block_data_); - - const auto& dict_type = checked_cast(*data.type()); - - ARROW_ASSIGN_OR_RAISE(auto unifier, DictionaryUnifier::Make(dict_type.value_type(), - this->options_.pool)); - for (int c = 0; c < data.num_chunks(); c++) { - const auto& arr = checked_cast(*data.chunk(c)); - const auto& indices = checked_cast(*arr.indices()); - auto values = reinterpret_cast(indices.raw_values()); - - std::shared_ptr transpose_buffer; - RETURN_NOT_OK(unifier->Unify(*arr.dictionary(), &transpose_buffer)); - - auto transpose = reinterpret_cast(transpose_buffer->data()); - int64_t dict_length = arr.dictionary()->length(); - - RETURN_NOT_OK(CheckIndexBounds(*indices.data(), dict_length)); - - // Null is -1 in CategoricalBlock - for (int i = 0; i < arr.length(); ++i) { - if (indices.IsValid(i)) { - *out_values++ = transpose[values[i]]; - } else { - *out_values++ = -1; - } - } - } - - std::shared_ptr unused_type; - return unifier->GetResult(&unused_type, out_dict); - } - - Status WriteIndices(const ChunkedArray& data, std::shared_ptr* out_dict) { - DCHECK_GT(data.num_chunks(), 0); - - // Sniff the first chunk - const auto& arr_first = checked_cast(*data.chunk(0)); - const auto indices_first = std::static_pointer_cast(arr_first.indices()); - - if (data.num_chunks() == 1 && indices_first->null_count() == 0) { - RETURN_NOT_OK( - CheckIndexBounds(*indices_first->data(), arr_first.dictionary()->length())); - - PyObject* wrapped; - npy_intp dims[1] = {static_cast(this->num_rows_)}; - RETURN_NOT_OK(MakeNumPyView(indices_first, /*py_ref=*/nullptr, TRAITS::npy_type, - /*ndim=*/1, dims, &wrapped)); - this->SetBlockData(wrapped); - *out_dict = arr_first.dictionary(); - } else { - RETURN_NOT_OK(this->CheckNotZeroCopyOnly(data)); - if (NeedDictionaryUnification(data)) { - RETURN_NOT_OK(WriteIndicesVarying(data, out_dict)); - } else { - RETURN_NOT_OK(WriteIndicesUniform(data)); - *out_dict = arr_first.dictionary(); - } - } - return Status::OK(); - } - - OwnedRefNoGIL dictionary_; - bool ordered_; - bool needs_copy_; -}; - -class ExtensionWriter : public PandasWriter { - public: - using PandasWriter::PandasWriter; - - Status Allocate() override { - // no-op - return Status::OK(); - } - - Status TransferSingle(std::shared_ptr data, PyObject* py_ref) override { - PyAcquireGIL lock; - PyObject* py_array; - py_array = wrap_chunked_array(data); - py_array_.reset(py_array); - - return Status::OK(); - } - - Status CopyInto(std::shared_ptr data, int64_t rel_placement) 
override { - return TransferSingle(data, nullptr); - } - - Status GetDataFrameResult(PyObject** out) override { - PyAcquireGIL lock; - PyObject* result = PyDict_New(); - RETURN_IF_PYERROR(); - - PyDict_SetItemString(result, "py_array", py_array_.obj()); - PyDict_SetItemString(result, "placement", placement_arr_.obj()); - *out = result; - return Status::OK(); - } - - Status GetSeriesResult(PyObject** out) override { - *out = py_array_.detach(); - return Status::OK(); - } - - protected: - OwnedRefNoGIL py_array_; -}; - -Status MakeWriter(const PandasOptions& options, PandasWriter::type writer_type, - const DataType& type, int64_t num_rows, int num_columns, - std::shared_ptr* writer) { -#define BLOCK_CASE(NAME, TYPE) \ - case PandasWriter::NAME: \ - *writer = std::make_shared(options, num_rows, num_columns); \ - break; - -#define CATEGORICAL_CASE(TYPE) \ - case TYPE::type_id: \ - *writer = std::make_shared>(options, num_rows); \ - break; - -#define TZ_CASE(NAME, TYPE) \ - case PandasWriter::NAME: { \ - const auto& ts_type = checked_cast(type); \ - *writer = std::make_shared(options, ts_type.timezone(), num_rows); \ - } break; - - switch (writer_type) { - case PandasWriter::CATEGORICAL: { - const auto& index_type = *checked_cast(type).index_type(); - switch (index_type.id()) { - CATEGORICAL_CASE(Int8Type); - CATEGORICAL_CASE(Int16Type); - CATEGORICAL_CASE(Int32Type); - CATEGORICAL_CASE(Int64Type); - case Type::UINT8: - case Type::UINT16: - case Type::UINT32: - case Type::UINT64: - return Status::TypeError( - "Converting unsigned dictionary indices to pandas", - " not yet supported, index type: ", index_type.ToString()); - default: - // Unreachable - DCHECK(false); - break; - } - } break; - case PandasWriter::EXTENSION: - *writer = std::make_shared(options, num_rows, num_columns); - break; - BLOCK_CASE(OBJECT, ObjectWriter); - BLOCK_CASE(UINT8, UInt8Writer); - BLOCK_CASE(INT8, Int8Writer); - BLOCK_CASE(UINT16, UInt16Writer); - BLOCK_CASE(INT16, Int16Writer); - BLOCK_CASE(UINT32, UInt32Writer); - BLOCK_CASE(INT32, Int32Writer); - BLOCK_CASE(UINT64, UInt64Writer); - BLOCK_CASE(INT64, Int64Writer); - BLOCK_CASE(HALF_FLOAT, Float16Writer); - BLOCK_CASE(FLOAT, Float32Writer); - BLOCK_CASE(DOUBLE, Float64Writer); - BLOCK_CASE(BOOL, BoolWriter); - BLOCK_CASE(DATETIME_DAY, DatetimeDayWriter); - BLOCK_CASE(DATETIME_SECOND, DatetimeSecondWriter); - BLOCK_CASE(DATETIME_MILLI, DatetimeMilliWriter); - BLOCK_CASE(DATETIME_MICRO, DatetimeMicroWriter); - BLOCK_CASE(DATETIME_NANO, DatetimeNanoWriter); - BLOCK_CASE(TIMEDELTA_SECOND, TimedeltaSecondWriter); - BLOCK_CASE(TIMEDELTA_MILLI, TimedeltaMilliWriter); - BLOCK_CASE(TIMEDELTA_MICRO, TimedeltaMicroWriter); - BLOCK_CASE(TIMEDELTA_NANO, TimedeltaNanoWriter); - TZ_CASE(DATETIME_SECOND_TZ, DatetimeSecondTZWriter); - TZ_CASE(DATETIME_MILLI_TZ, DatetimeMilliTZWriter); - TZ_CASE(DATETIME_MICRO_TZ, DatetimeMicroTZWriter); - TZ_CASE(DATETIME_NANO_TZ, DatetimeNanoTZWriter); - default: - return Status::NotImplemented("Unsupported block type"); - } - -#undef BLOCK_CASE -#undef CATEGORICAL_CASE - - return Status::OK(); -} - -static Status GetPandasWriterType(const ChunkedArray& data, const PandasOptions& options, - PandasWriter::type* output_type) { -#define INTEGER_CASE(NAME) \ - *output_type = \ - data.null_count() > 0 \ - ? options.integer_object_nulls ? PandasWriter::OBJECT : PandasWriter::DOUBLE \ - : PandasWriter::NAME; \ - break; - - switch (data.type()->id()) { - case Type::BOOL: - *output_type = data.null_count() > 0 ? 
PandasWriter::OBJECT : PandasWriter::BOOL; - break; - case Type::UINT8: - INTEGER_CASE(UINT8); - case Type::INT8: - INTEGER_CASE(INT8); - case Type::UINT16: - INTEGER_CASE(UINT16); - case Type::INT16: - INTEGER_CASE(INT16); - case Type::UINT32: - INTEGER_CASE(UINT32); - case Type::INT32: - INTEGER_CASE(INT32); - case Type::UINT64: - INTEGER_CASE(UINT64); - case Type::INT64: - INTEGER_CASE(INT64); - case Type::HALF_FLOAT: - *output_type = PandasWriter::HALF_FLOAT; - break; - case Type::FLOAT: - *output_type = PandasWriter::FLOAT; - break; - case Type::DOUBLE: - *output_type = PandasWriter::DOUBLE; - break; - case Type::STRING: // fall through - case Type::LARGE_STRING: // fall through - case Type::BINARY: // fall through - case Type::LARGE_BINARY: - case Type::NA: // fall through - case Type::FIXED_SIZE_BINARY: // fall through - case Type::STRUCT: // fall through - case Type::TIME32: // fall through - case Type::TIME64: // fall through - case Type::DECIMAL128: // fall through - case Type::DECIMAL256: // fall through - case Type::INTERVAL_MONTH_DAY_NANO: // fall through - *output_type = PandasWriter::OBJECT; - break; - case Type::DATE32: - if (options.date_as_object) { - *output_type = PandasWriter::OBJECT; - } else if (options.coerce_temporal_nanoseconds) { - *output_type = PandasWriter::DATETIME_NANO; - } else if (options.to_numpy) { - // Numpy supports Day, but Pandas does not - *output_type = PandasWriter::DATETIME_DAY; - } else { - *output_type = PandasWriter::DATETIME_MILLI; - } - break; - case Type::DATE64: - if (options.date_as_object) { - *output_type = PandasWriter::OBJECT; - } else if (options.coerce_temporal_nanoseconds) { - *output_type = PandasWriter::DATETIME_NANO; - } else { - *output_type = PandasWriter::DATETIME_MILLI; - } - break; - case Type::TIMESTAMP: { - const auto& ts_type = checked_cast(*data.type()); - if (options.timestamp_as_object && ts_type.unit() != TimeUnit::NANO) { - // Nanoseconds are never out of bounds for pandas, so in that case - // we don't convert to object - *output_type = PandasWriter::OBJECT; - } else if (options.coerce_temporal_nanoseconds) { - if (!ts_type.timezone().empty()) { - *output_type = PandasWriter::DATETIME_NANO_TZ; - } else { - *output_type = PandasWriter::DATETIME_NANO; - } - } else { - if (!ts_type.timezone().empty()) { - switch (ts_type.unit()) { - case TimeUnit::SECOND: - *output_type = PandasWriter::DATETIME_SECOND_TZ; - break; - case TimeUnit::MILLI: - *output_type = PandasWriter::DATETIME_MILLI_TZ; - break; - case TimeUnit::MICRO: - *output_type = PandasWriter::DATETIME_MICRO_TZ; - break; - case TimeUnit::NANO: - *output_type = PandasWriter::DATETIME_NANO_TZ; - break; - } - } else { - switch (ts_type.unit()) { - case TimeUnit::SECOND: - *output_type = PandasWriter::DATETIME_SECOND; - break; - case TimeUnit::MILLI: - *output_type = PandasWriter::DATETIME_MILLI; - break; - case TimeUnit::MICRO: - *output_type = PandasWriter::DATETIME_MICRO; - break; - case TimeUnit::NANO: - *output_type = PandasWriter::DATETIME_NANO; - break; - } - } - } - } break; - case Type::DURATION: { - const auto& dur_type = checked_cast(*data.type()); - if (options.coerce_temporal_nanoseconds) { - *output_type = PandasWriter::TIMEDELTA_NANO; - } else { - switch (dur_type.unit()) { - case TimeUnit::SECOND: - *output_type = PandasWriter::TIMEDELTA_SECOND; - break; - case TimeUnit::MILLI: - *output_type = PandasWriter::TIMEDELTA_MILLI; - break; - case TimeUnit::MICRO: - *output_type = PandasWriter::TIMEDELTA_MICRO; - break; - case TimeUnit::NANO: - *output_type 
= PandasWriter::TIMEDELTA_NANO; - break; - } - } - } break; - case Type::FIXED_SIZE_LIST: - case Type::LIST: - case Type::LARGE_LIST: - case Type::MAP: { - auto list_type = std::static_pointer_cast(data.type()); - if (!ListTypeSupported(*list_type->value_type())) { - return Status::NotImplemented("Not implemented type for Arrow list to pandas: ", - list_type->value_type()->ToString()); - } - *output_type = PandasWriter::OBJECT; - } break; - case Type::DICTIONARY: - *output_type = PandasWriter::CATEGORICAL; - break; - case Type::EXTENSION: - *output_type = PandasWriter::EXTENSION; - break; - default: - return Status::NotImplemented( - "No known equivalent Pandas block for Arrow data of type ", - data.type()->ToString(), " is known."); - } - return Status::OK(); -} - -// Construct the exact pandas "BlockManager" memory layout -// -// * For each column determine the correct output pandas type -// * Allocate 2D blocks (ncols x nrows) for each distinct data type in output -// * Allocate block placement arrays -// * Write Arrow columns out into each slice of memory; populate block -// * placement arrays as we go -class PandasBlockCreator { - public: - using WriterMap = std::unordered_map>; - - explicit PandasBlockCreator(const PandasOptions& options, FieldVector fields, - ChunkedArrayVector arrays) - : options_(options), fields_(std::move(fields)), arrays_(std::move(arrays)) { - num_columns_ = static_cast(arrays_.size()); - if (num_columns_ > 0) { - num_rows_ = arrays_[0]->length(); - } - column_block_placement_.resize(num_columns_); - } - virtual ~PandasBlockCreator() = default; - - virtual Status Convert(PyObject** out) = 0; - - Status AppendBlocks(const WriterMap& blocks, PyObject* list) { - for (const auto& it : blocks) { - PyObject* item; - RETURN_NOT_OK(it.second->GetDataFrameResult(&item)); - if (PyList_Append(list, item) < 0) { - RETURN_IF_PYERROR(); - } - - // ARROW-1017; PyList_Append increments object refcount - Py_DECREF(item); - } - return Status::OK(); - } - - protected: - PandasOptions options_; - - FieldVector fields_; - ChunkedArrayVector arrays_; - int num_columns_; - int64_t num_rows_; - - // column num -> relative placement within internal block - std::vector column_block_placement_; -}; - -// Helper function for extension chunked arrays -// Constructing a storage chunked array of an extension chunked array -std::shared_ptr GetStorageChunkedArray(std::shared_ptr arr) { - auto value_type = checked_cast(*arr->type()).storage_type(); - ArrayVector storage_arrays; - for (int c = 0; c < arr->num_chunks(); c++) { - const auto& arr_ext = checked_cast(*arr->chunk(c)); - storage_arrays.emplace_back(arr_ext.storage()); - } - return std::make_shared(std::move(storage_arrays), value_type); -}; - -class ConsolidatedBlockCreator : public PandasBlockCreator { - public: - using PandasBlockCreator::PandasBlockCreator; - - Status Convert(PyObject** out) override { - column_types_.resize(num_columns_); - RETURN_NOT_OK(CreateBlocks()); - RETURN_NOT_OK(WriteTableToBlocks()); - PyAcquireGIL lock; - - PyObject* result = PyList_New(0); - RETURN_IF_PYERROR(); - - RETURN_NOT_OK(AppendBlocks(blocks_, result)); - RETURN_NOT_OK(AppendBlocks(singleton_blocks_, result)); - - *out = result; - return Status::OK(); - } - - Status GetBlockType(int column_index, PandasWriter::type* out) { - if (options_.extension_columns.count(fields_[column_index]->name())) { - *out = PandasWriter::EXTENSION; - return Status::OK(); - } else { - // In case of an extension array default to the storage type - if 
(arrays_[column_index]->type()->id() == Type::EXTENSION) { - arrays_[column_index] = GetStorageChunkedArray(arrays_[column_index]); - } - return GetPandasWriterType(*arrays_[column_index], options_, out); - } - } - - Status CreateBlocks() { - for (int i = 0; i < num_columns_; ++i) { - const DataType& type = *arrays_[i]->type(); - PandasWriter::type output_type; - RETURN_NOT_OK(GetBlockType(i, &output_type)); - - int block_placement = 0; - std::shared_ptr writer; - if (output_type == PandasWriter::CATEGORICAL || - output_type == PandasWriter::DATETIME_SECOND_TZ || - output_type == PandasWriter::DATETIME_MILLI_TZ || - output_type == PandasWriter::DATETIME_MICRO_TZ || - output_type == PandasWriter::DATETIME_NANO_TZ || - output_type == PandasWriter::EXTENSION) { - RETURN_NOT_OK(MakeWriter(options_, output_type, type, num_rows_, - /*num_columns=*/1, &writer)); - singleton_blocks_[i] = writer; - } else { - auto it = block_sizes_.find(output_type); - if (it != block_sizes_.end()) { - block_placement = it->second; - // Increment count - ++it->second; - } else { - // Add key to map - block_sizes_[output_type] = 1; - } - } - column_types_[i] = output_type; - column_block_placement_[i] = block_placement; - } - - // Create normal non-categorical blocks - for (const auto& it : this->block_sizes_) { - PandasWriter::type output_type = static_cast(it.first); - std::shared_ptr block; - RETURN_NOT_OK(MakeWriter(this->options_, output_type, /*unused*/ *null(), num_rows_, - it.second, &block)); - this->blocks_[output_type] = block; - } - return Status::OK(); - } - - Status GetWriter(int i, std::shared_ptr* block) { - PandasWriter::type output_type = this->column_types_[i]; - switch (output_type) { - case PandasWriter::CATEGORICAL: - case PandasWriter::DATETIME_SECOND_TZ: - case PandasWriter::DATETIME_MILLI_TZ: - case PandasWriter::DATETIME_MICRO_TZ: - case PandasWriter::DATETIME_NANO_TZ: - case PandasWriter::EXTENSION: { - auto it = this->singleton_blocks_.find(i); - if (it == this->singleton_blocks_.end()) { - return Status::KeyError("No block allocated"); - } - *block = it->second; - } break; - default: - auto it = this->blocks_.find(output_type); - if (it == this->blocks_.end()) { - return Status::KeyError("No block allocated"); - } - *block = it->second; - break; - } - return Status::OK(); - } - - Status WriteTableToBlocks() { - auto WriteColumn = [this](int i) { - std::shared_ptr block; - RETURN_NOT_OK(this->GetWriter(i, &block)); - // ARROW-3789 Use std::move on the array to permit self-destructing - return block->Write(std::move(arrays_[i]), i, this->column_block_placement_[i]); - }; - - return OptionalParallelFor(options_.use_threads, num_columns_, WriteColumn); - } - - private: - // column num -> block type id - std::vector column_types_; - - // block type -> type count - std::unordered_map block_sizes_; - std::unordered_map block_types_; - - // block type -> block - WriterMap blocks_; - - WriterMap singleton_blocks_; -}; - -/// \brief Create blocks for pandas.DataFrame block manager using one block per -/// column strategy. This permits some zero-copy optimizations as well as the -/// ability for the table to "self-destruct" if selected by the user. 
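-///
-/// A sketch of the effect as seen from Python (assuming pyarrow's public
-/// Table.to_pandas options):
-///
-///   df = table.to_pandas(split_blocks=True, self_destruct=True)
-///   del table  # buffers can now back the DataFrame without a copy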
-class SplitBlockCreator : public PandasBlockCreator { - public: - using PandasBlockCreator::PandasBlockCreator; - - Status GetWriter(int i, std::shared_ptr* writer) { - PandasWriter::type output_type = PandasWriter::OBJECT; - const DataType& type = *arrays_[i]->type(); - if (options_.extension_columns.count(fields_[i]->name())) { - output_type = PandasWriter::EXTENSION; - } else { - // Null count needed to determine output type - RETURN_NOT_OK(GetPandasWriterType(*arrays_[i], options_, &output_type)); - } - return MakeWriter(this->options_, output_type, type, num_rows_, 1, writer); - } - - Status Convert(PyObject** out) override { - PyAcquireGIL lock; - - PyObject* result = PyList_New(0); - RETURN_IF_PYERROR(); - - for (int i = 0; i < num_columns_; ++i) { - std::shared_ptr writer; - RETURN_NOT_OK(GetWriter(i, &writer)); - // ARROW-3789 Use std::move on the array to permit self-destructing - RETURN_NOT_OK(writer->Write(std::move(arrays_[i]), i, /*rel_placement=*/0)); - - PyObject* item; - RETURN_NOT_OK(writer->GetDataFrameResult(&item)); - if (PyList_Append(result, item) < 0) { - RETURN_IF_PYERROR(); - } - // PyList_Append increments object refcount - Py_DECREF(item); - } - - *out = result; - return Status::OK(); - } - - private: - std::vector> writers_; -}; - -Status ConvertCategoricals(const PandasOptions& options, ChunkedArrayVector* arrays, - FieldVector* fields) { - std::vector columns_to_encode; - - // For Categorical conversions - auto EncodeColumn = [&](int j) { - int i = columns_to_encode[j]; - if (options.zero_copy_only) { - return Status::Invalid("Need to dictionary encode a column, but ", - "only zero-copy conversions allowed"); - } - compute::ExecContext ctx(options.pool); - ARROW_ASSIGN_OR_RAISE( - Datum out, DictionaryEncode((*arrays)[i], - compute::DictionaryEncodeOptions::Defaults(), &ctx)); - (*arrays)[i] = out.chunked_array(); - (*fields)[i] = (*fields)[i]->WithType((*arrays)[i]->type()); - return Status::OK(); - }; - - if (!options.categorical_columns.empty()) { - for (int i = 0; i < static_cast(arrays->size()); i++) { - if ((*arrays)[i]->type()->id() != Type::DICTIONARY && - options.categorical_columns.count((*fields)[i]->name())) { - columns_to_encode.push_back(i); - } - } - } - if (options.strings_to_categorical) { - for (int i = 0; i < static_cast(arrays->size()); i++) { - if (is_base_binary_like((*arrays)[i]->type()->id())) { - columns_to_encode.push_back(i); - } - } - } - return OptionalParallelFor(options.use_threads, - static_cast(columns_to_encode.size()), EncodeColumn); -} - -} // namespace - -Status ConvertArrayToPandas(const PandasOptions& options, std::shared_ptr arr, - PyObject* py_ref, PyObject** out) { - return ConvertChunkedArrayToPandas( - options, std::make_shared(std::move(arr)), py_ref, out); -} - -Status ConvertChunkedArrayToPandas(const PandasOptions& options, - std::shared_ptr arr, PyObject* py_ref, - PyObject** out) { - if (options.decode_dictionaries && arr->type()->id() == Type::DICTIONARY) { - const auto& dense_type = - checked_cast(*arr->type()).value_type(); - RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &arr)); - DCHECK_NE(arr->type()->id(), Type::DICTIONARY); - - // The original Python DictionaryArray won't own the memory anymore - // as we actually built a new array when we decoded the DictionaryArray - // thus let the final resulting numpy array own the memory through a Capsule - py_ref = nullptr; - } - - if (options.strings_to_categorical && is_base_binary_like(arr->type()->id())) { - if (options.zero_copy_only) { - 
return Status::Invalid("Need to dictionary encode a column, but ", - "only zero-copy conversions allowed"); - } - compute::ExecContext ctx(options.pool); - ARROW_ASSIGN_OR_RAISE( - Datum out, - DictionaryEncode(arr, compute::DictionaryEncodeOptions::Defaults(), &ctx)); - arr = out.chunked_array(); - } - - PandasOptions modified_options = options; - modified_options.strings_to_categorical = false; - - // ARROW-7596: We permit the hybrid Series/DataFrame code path to do zero copy - // optimizations that we do not allow in the default case when converting - // Table->DataFrame - modified_options.allow_zero_copy_blocks = true; - - // In case of an extension array default to the storage type - if (arr->type()->id() == Type::EXTENSION) { - arr = GetStorageChunkedArray(arr); - } - - PandasWriter::type output_type; - RETURN_NOT_OK(GetPandasWriterType(*arr, modified_options, &output_type)); - if (options.decode_dictionaries) { - DCHECK_NE(output_type, PandasWriter::CATEGORICAL); - } - - std::shared_ptr writer; - RETURN_NOT_OK(MakeWriter(modified_options, output_type, *arr->type(), arr->length(), - /*num_columns=*/1, &writer)); - RETURN_NOT_OK(writer->TransferSingle(std::move(arr), py_ref)); - return writer->GetSeriesResult(out); -} - -Status ConvertTableToPandas(const PandasOptions& options, std::shared_ptr table, - PyObject** out) { - ChunkedArrayVector arrays = table->columns(); - FieldVector fields = table->fields(); - - // ARROW-3789: allow "self-destructing" by releasing references to columns as - // we convert them to pandas - table = nullptr; - - RETURN_NOT_OK(ConvertCategoricals(options, &arrays, &fields)); - - PandasOptions modified_options = options; - modified_options.strings_to_categorical = false; - modified_options.categorical_columns.clear(); - - if (options.split_blocks) { - modified_options.allow_zero_copy_blocks = true; - SplitBlockCreator helper(modified_options, std::move(fields), std::move(arrays)); - return helper.Convert(out); - } else { - ConsolidatedBlockCreator helper(modified_options, std::move(fields), - std::move(arrays)); - return helper.Convert(out); - } -} - -} // namespace py -} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/arrow_to_pandas.h b/src/vendored/apache-arrow-12.0.1/arrow/python/arrow_to_pandas.h deleted file mode 100644 index 82e0a60..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/arrow_to_pandas.h +++ /dev/null @@ -1,146 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -// Functions for converting between pandas's NumPy-based data representation -// and Arrow data structures - -#pragma once - -#include "arrow/python/platform.h" - -#include <memory> -#include <string> -#include <unordered_set> - -#include "arrow/memory_pool.h" -#include "arrow/python/visibility.h" - -namespace arrow { - -class Array; -class ChunkedArray; -class Column; -class DataType; -class MemoryPool; -class Status; -class Table; - -namespace py { - -enum class MapConversionType { - DEFAULT, // convert arrow maps to assoc lists (list of key-value tuples) in Pandas - LOSSY, // report warnings when lossiness is encountered due to duplicate keys - STRICT_, // raise a Python exception when lossiness is encountered due to duplicate - // keys -}; - -struct PandasOptions { - /// arrow::MemoryPool to use for memory allocations - MemoryPool* pool = default_memory_pool(); - - /// If true, we will convert all string columns to categoricals - bool strings_to_categorical = false; - bool zero_copy_only = false; - bool integer_object_nulls = false; - bool date_as_object = false; - bool timestamp_as_object = false; - bool use_threads = false; - - /// Coerce all date and timestamp to datetime64[ns] - bool coerce_temporal_nanoseconds = false; - - /// Used to maintain backwards compatibility for - /// timezone bugs (see ARROW-9528). Should be removed - /// after Arrow 2.0 release. - bool ignore_timezone = false; - - /// \brief If true, do not create duplicate PyObject versions of equal - /// objects. This only applies to immutable objects like strings or datetime - /// objects - bool deduplicate_objects = false; - - /// \brief For certain data types, a cast is needed in order to store the - /// data in a pandas DataFrame or Series (e.g. timestamps are always stored - /// as nanoseconds in pandas). This option controls whether it is a safe - /// cast or not. - bool safe_cast = true; - - /// \brief If true, create one block per column rather than consolidated - /// blocks (1 per data type). Do zero-copy wrapping when there are no - /// nulls. pandas currently will consolidate the blocks on its own, causing - /// increased memory use, so keep this in mind if you are working in a - /// memory-constrained situation. - bool split_blocks = false; - - /// \brief If true, allow non-writable zero-copy views to be created for - /// single column blocks. This option is also used to provide zero copy for - /// Series data - bool allow_zero_copy_blocks = false; - - /// \brief If true, attempt to deallocate buffers in passed Arrow object if - /// it is the only remaining shared_ptr copy of it. See ARROW-3789 for - /// original context for this feature. Only currently implemented for Table - /// conversions - bool self_destruct = false; - - /// \brief The default behavior (DEFAULT) is to convert Arrow Map arrays to - /// Python association lists (list-of-tuples) in the same order as the Arrow - /// Map, as in [(key1, value1), (key2, value2), ...] - /// If LOSSY or STRICT, convert Arrow Map arrays to native Python dicts. - /// This can change the ordering of (key, value) pairs, and will deduplicate - /// multiple keys, resulting in a possible loss of data. - /// If 'lossy', this key deduplication results in a warning printed - /// when detected. If 'strict', this instead results in an exception - /// being raised when detected. - MapConversionType maps_as_pydicts = MapConversionType::DEFAULT; - - // Used internally for nested arrays.
- bool decode_dictionaries = false; - - // Columns that should be cast to categorical - std::unordered_set<std::string> categorical_columns; - - // Columns that should be passed through to be converted to - // ExtensionArray/Block - std::unordered_set<std::string> extension_columns; - - // Used internally to distinguish between to_numpy() and to_pandas() when - // the expected output differs - bool to_numpy = false; -}; - -ARROW_PYTHON_EXPORT -Status ConvertArrayToPandas(const PandasOptions& options, std::shared_ptr<Array> arr, - PyObject* py_ref, PyObject** out); - -ARROW_PYTHON_EXPORT -Status ConvertChunkedArrayToPandas(const PandasOptions& options, - std::shared_ptr<ChunkedArray> col, PyObject* py_ref, - PyObject** out); - -// Convert a whole table as efficiently as possible to a pandas.DataFrame. -// -// The returned Python object is a list of tuples consisting of the exact 2D -// BlockManager structure of the pandas.DataFrame used as of pandas 0.19.x. -// -// tuple item: (indices: ndarray[int32], block: ndarray[TYPE, ndim=2]) -ARROW_PYTHON_EXPORT -Status ConvertTableToPandas(const PandasOptions& options, std::shared_ptr<Table>
table, - PyObject** out); - -} // namespace py -} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/arrow_to_python_internal.h b/src/vendored/apache-arrow-12.0.1/arrow/python/arrow_to_python_internal.h deleted file mode 100644 index 514cda3..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/arrow_to_python_internal.h +++ /dev/null @@ -1,49 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include "arrow/array.h" -#include "arrow/python/platform.h" - -namespace arrow { -namespace py { -namespace internal { -// TODO(ARROW-12976): See if we can refactor Pandas ObjectWriter logic -// to the .cc file and move this there as well if we can. - -// Converts an array to a sequence of Python objects. -template <typename ArrayType, typename WriteValue, typename Assigner> -inline Status WriteArrayObjects(const ArrayType& arr, WriteValue&& write_func, - Assigner out_values) { - // TODO(ARROW-12976): Use visitor here? - const bool has_nulls = arr.null_count() > 0; - for (int64_t i = 0; i < arr.length(); ++i) { - if (has_nulls && arr.IsNull(i)) { - Py_INCREF(Py_None); - *out_values = Py_None; - } else { - RETURN_NOT_OK(write_func(arr.GetView(i), out_values)); - } - ++out_values; - } - return Status::OK(); - } - -} // namespace internal -} // namespace py -} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/benchmark.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/benchmark.cc deleted file mode 100644 index 6dcc959..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/benchmark.cc +++ /dev/null @@ -1,38 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License.
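The WriteArrayObjects helper above is generic over the array type, the per-value converter, and the output assigner; hand-specialized for arrow::StringArray it comes out to roughly the following sketch (assumes the caller holds the GIL):

    #include <Python.h>

    #include <string_view>

    #include <arrow/api.h>

    // Convert a StringArray to a new Python list, mapping nulls to None.
    PyObject* StringArrayToPyList(const arrow::StringArray& arr) {
      PyObject* list = PyList_New(arr.length());
      if (list == nullptr) return nullptr;
      const bool has_nulls = arr.null_count() > 0;
      for (int64_t i = 0; i < arr.length(); ++i) {
        PyObject* item = nullptr;
        if (has_nulls && arr.IsNull(i)) {
          Py_INCREF(Py_None);
          item = Py_None;
        } else {
          std::string_view v = arr.GetView(i);
          item = PyUnicode_FromStringAndSize(v.data(),
                                             static_cast<Py_ssize_t>(v.size()));
          if (item == nullptr) {
            Py_DECREF(list);  // also drops any items already stored
            return nullptr;
          }
        }
        PyList_SET_ITEM(list, i, item);  // steals the reference to item
      }
      return list;
    }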
- -#include "arrow/python/benchmark.h" -#include "arrow/python/helpers.h" - -namespace arrow { -namespace py { -namespace benchmark { - -void Benchmark_PandasObjectIsNull(PyObject* list) { - if (!PyList_CheckExact(list)) { - PyErr_SetString(PyExc_TypeError, "expected a list"); - return; - } - Py_ssize_t i, n = PyList_GET_SIZE(list); - for (i = 0; i < n; i++) { - internal::PandasObjectIsNull(PyList_GET_ITEM(list, i)); - } -} - -} // namespace benchmark -} // namespace py -} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/benchmark.h b/src/vendored/apache-arrow-12.0.1/arrow/python/benchmark.h deleted file mode 100644 index 8060dd3..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/benchmark.h +++ /dev/null @@ -1,36 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include "arrow/python/platform.h" - -#include "arrow/python/visibility.h" - -namespace arrow { -namespace py { -namespace benchmark { - -// Micro-benchmark routines for use from ASV - -// Run PandasObjectIsNull() once over every object in *list* -ARROW_PYTHON_EXPORT -void Benchmark_PandasObjectIsNull(PyObject* list); - -} // namespace benchmark -} // namespace py -} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/common.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/common.cc deleted file mode 100644 index 6fe2ed4..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/common.cc +++ /dev/null @@ -1,203 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#include "arrow/python/common.h" - -#include -#include -#include - -#include "arrow/memory_pool.h" -#include "arrow/status.h" -#include "arrow/util/checked_cast.h" -#include "arrow/util/logging.h" - -#include "arrow/python/helpers.h" - -namespace arrow { - -using internal::checked_cast; - -namespace py { - -static std::mutex memory_pool_mutex; -static MemoryPool* default_python_pool = nullptr; - -void set_default_memory_pool(MemoryPool* pool) { - std::lock_guard guard(memory_pool_mutex); - default_python_pool = pool; -} - -MemoryPool* get_memory_pool() { - std::lock_guard guard(memory_pool_mutex); - if (default_python_pool) { - return default_python_pool; - } else { - return default_memory_pool(); - } -} - -// ---------------------------------------------------------------------- -// PythonErrorDetail - -namespace { - -const char kErrorDetailTypeId[] = "arrow::py::PythonErrorDetail"; - -// Try to match the Python exception type with an appropriate Status code -StatusCode MapPyError(PyObject* exc_type) { - StatusCode code; - - if (PyErr_GivenExceptionMatches(exc_type, PyExc_MemoryError)) { - code = StatusCode::OutOfMemory; - } else if (PyErr_GivenExceptionMatches(exc_type, PyExc_IndexError)) { - code = StatusCode::IndexError; - } else if (PyErr_GivenExceptionMatches(exc_type, PyExc_KeyError)) { - code = StatusCode::KeyError; - } else if (PyErr_GivenExceptionMatches(exc_type, PyExc_TypeError)) { - code = StatusCode::TypeError; - } else if (PyErr_GivenExceptionMatches(exc_type, PyExc_ValueError) || - PyErr_GivenExceptionMatches(exc_type, PyExc_OverflowError)) { - code = StatusCode::Invalid; - } else if (PyErr_GivenExceptionMatches(exc_type, PyExc_EnvironmentError)) { - code = StatusCode::IOError; - } else if (PyErr_GivenExceptionMatches(exc_type, PyExc_NotImplementedError)) { - code = StatusCode::NotImplemented; - } else { - code = StatusCode::UnknownError; - } - return code; -} - -// PythonErrorDetail indicates a Python exception was raised. -class PythonErrorDetail : public StatusDetail { - public: - const char* type_id() const override { return kErrorDetailTypeId; } - - std::string ToString() const override { - // This is simple enough not to need the GIL - const auto ty = reinterpret_cast(exc_type_.obj()); - // XXX Should we also print traceback? 
- return std::string("Python exception: ") + ty->tp_name; - } - - void RestorePyError() const { - Py_INCREF(exc_type_.obj()); - Py_INCREF(exc_value_.obj()); - Py_INCREF(exc_traceback_.obj()); - PyErr_Restore(exc_type_.obj(), exc_value_.obj(), exc_traceback_.obj()); - } - - PyObject* exc_type() const { return exc_type_.obj(); } - - PyObject* exc_value() const { return exc_value_.obj(); } - - static std::shared_ptr FromPyError() { - PyObject* exc_type = nullptr; - PyObject* exc_value = nullptr; - PyObject* exc_traceback = nullptr; - - PyErr_Fetch(&exc_type, &exc_value, &exc_traceback); - PyErr_NormalizeException(&exc_type, &exc_value, &exc_traceback); - ARROW_CHECK(exc_type) - << "PythonErrorDetail::FromPyError called without a Python error set"; - DCHECK(PyType_Check(exc_type)); - DCHECK(exc_value); // Ensured by PyErr_NormalizeException, double-check - if (exc_traceback == nullptr) { - // Needed by PyErr_Restore() - Py_INCREF(Py_None); - exc_traceback = Py_None; - } - - std::shared_ptr detail(new PythonErrorDetail); - detail->exc_type_.reset(exc_type); - detail->exc_value_.reset(exc_value); - detail->exc_traceback_.reset(exc_traceback); - return detail; - } - - protected: - PythonErrorDetail() = default; - - OwnedRefNoGIL exc_type_, exc_value_, exc_traceback_; -}; - -} // namespace - -// ---------------------------------------------------------------------- -// Python exception <-> Status - -Status ConvertPyError(StatusCode code) { - auto detail = PythonErrorDetail::FromPyError(); - if (code == StatusCode::UnknownError) { - code = MapPyError(detail->exc_type()); - } - - std::string message; - RETURN_NOT_OK(internal::PyObject_StdStringStr(detail->exc_value(), &message)); - return Status(code, message, detail); -} - -bool IsPyError(const Status& status) { - if (status.ok()) { - return false; - } - auto detail = status.detail(); - bool result = detail != nullptr && detail->type_id() == kErrorDetailTypeId; - return result; -} - -void RestorePyError(const Status& status) { - ARROW_CHECK(IsPyError(status)); - const auto& detail = checked_cast(*status.detail()); - detail.RestorePyError(); -} - -// ---------------------------------------------------------------------- -// PyBuffer - -PyBuffer::PyBuffer() : Buffer(nullptr, 0) {} - -Status PyBuffer::Init(PyObject* obj) { - if (!PyObject_GetBuffer(obj, &py_buf_, PyBUF_ANY_CONTIGUOUS)) { - data_ = reinterpret_cast(py_buf_.buf); - ARROW_CHECK_NE(data_, nullptr) << "Null pointer in Py_buffer"; - size_ = py_buf_.len; - capacity_ = py_buf_.len; - is_mutable_ = !py_buf_.readonly; - return Status::OK(); - } else { - return ConvertPyError(StatusCode::Invalid); - } -} - -Result> PyBuffer::FromPyObject(PyObject* obj) { - PyBuffer* buf = new PyBuffer(); - std::shared_ptr res(buf); - RETURN_NOT_OK(buf->Init(obj)); - return res; -} - -PyBuffer::~PyBuffer() { - if (data_ != nullptr) { - PyAcquireGIL lock; - PyBuffer_Release(&py_buf_); - } -} - -} // namespace py -} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/common.h b/src/vendored/apache-arrow-12.0.1/arrow/python/common.h deleted file mode 100644 index bfd11ba..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/common.h +++ /dev/null @@ -1,366 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include -#include - -#include "arrow/buffer.h" -#include "arrow/python/pyarrow.h" -#include "arrow/python/visibility.h" -#include "arrow/result.h" -#include "arrow/util/macros.h" - -namespace arrow { - -class MemoryPool; -template -class Result; - -namespace py { - -// Convert current Python error to a Status. The Python error state is cleared -// and can be restored with RestorePyError(). -ARROW_PYTHON_EXPORT Status ConvertPyError(StatusCode code = StatusCode::UnknownError); -// Query whether the given Status is a Python error (as wrapped by ConvertPyError()). -ARROW_PYTHON_EXPORT bool IsPyError(const Status& status); -// Restore a Python error wrapped in a Status. -ARROW_PYTHON_EXPORT void RestorePyError(const Status& status); - -// Catch a pending Python exception and return the corresponding Status. -// If no exception is pending, Status::OK() is returned. -inline Status CheckPyError(StatusCode code = StatusCode::UnknownError) { - if (ARROW_PREDICT_TRUE(!PyErr_Occurred())) { - return Status::OK(); - } else { - return ConvertPyError(code); - } -} - -#define RETURN_IF_PYERROR() ARROW_RETURN_NOT_OK(CheckPyError()) - -#define PY_RETURN_IF_ERROR(CODE) ARROW_RETURN_NOT_OK(CheckPyError(CODE)) - -// For Cython, as you can't define template C++ functions in Cython, only use them. -// This function can set a Python exception. It assumes that T has a (cheap) -// default constructor. -template -T GetResultValue(Result result) { - if (ARROW_PREDICT_TRUE(result.ok())) { - return *std::move(result); - } else { - int r = internal::check_status(result.status()); // takes the GIL - assert(r == -1); // should have errored out - ARROW_UNUSED(r); - return {}; - } -} - -// A RAII-style helper that ensures the GIL is acquired inside a lexical block. -class ARROW_PYTHON_EXPORT PyAcquireGIL { - public: - PyAcquireGIL() : acquired_gil_(false) { acquire(); } - - ~PyAcquireGIL() { release(); } - - void acquire() { - if (!acquired_gil_) { - state_ = PyGILState_Ensure(); - acquired_gil_ = true; - } - } - - // idempotent - void release() { - if (acquired_gil_) { - PyGILState_Release(state_); - acquired_gil_ = false; - } - } - - private: - bool acquired_gil_; - PyGILState_STATE state_; - ARROW_DISALLOW_COPY_AND_ASSIGN(PyAcquireGIL); -}; - -// A RAII-style helper that releases the GIL until the end of a lexical block -class ARROW_PYTHON_EXPORT PyReleaseGIL { - public: - PyReleaseGIL() { saved_state_ = PyEval_SaveThread(); } - - ~PyReleaseGIL() { PyEval_RestoreThread(saved_state_); } - - private: - PyThreadState* saved_state_; - ARROW_DISALLOW_COPY_AND_ASSIGN(PyReleaseGIL); -}; - -// A helper to call safely into the Python interpreter from arbitrary C++ code. -// The GIL is acquired, and the current thread's error status is preserved. 
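PyAcquireGIL above is a thin RAII wrapper over PyGILState_Ensure/PyGILState_Release; stripped of the Arrow scaffolding, the idea is just (standalone sketch, CPython C API only):

    #include <Python.h>

    class GILGuard {
     public:
      GILGuard() : state_(PyGILState_Ensure()) {}
      ~GILGuard() { PyGILState_Release(state_); }
      GILGuard(const GILGuard&) = delete;
      GILGuard& operator=(const GILGuard&) = delete;

     private:
      PyGILState_STATE state_;
    };

    void CallPythonFromAnyThread() {
      GILGuard gil;                     // safe even on threads that never held the GIL
      PyRun_SimpleString("x = 1 + 1");  // any Python C API call is now safe
    }                                   // GIL released on scope exit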
-template -auto SafeCallIntoPython(Function&& func) -> decltype(func()) { - PyAcquireGIL lock; - PyObject* exc_type; - PyObject* exc_value; - PyObject* exc_traceback; - PyErr_Fetch(&exc_type, &exc_value, &exc_traceback); - auto maybe_status = std::forward(func)(); - // If the return Status is a "Python error", the current Python error status - // describes the error and shouldn't be clobbered. - if (!IsPyError(::arrow::internal::GenericToStatus(maybe_status)) && - exc_type != NULLPTR) { - PyErr_Restore(exc_type, exc_value, exc_traceback); - } - return maybe_status; -} - -// A RAII primitive that DECREFs the underlying PyObject* when it -// goes out of scope. -class ARROW_PYTHON_EXPORT OwnedRef { - public: - OwnedRef() : obj_(NULLPTR) {} - OwnedRef(OwnedRef&& other) : OwnedRef(other.detach()) {} - explicit OwnedRef(PyObject* obj) : obj_(obj) {} - - OwnedRef& operator=(OwnedRef&& other) { - obj_ = other.detach(); - return *this; - } - - ~OwnedRef() { reset(); } - - void reset(PyObject* obj) { - Py_XDECREF(obj_); - obj_ = obj; - } - - void reset() { reset(NULLPTR); } - - PyObject* detach() { - PyObject* result = obj_; - obj_ = NULLPTR; - return result; - } - - PyObject* obj() const { return obj_; } - - PyObject** ref() { return &obj_; } - - operator bool() const { return obj_ != NULLPTR; } - - private: - ARROW_DISALLOW_COPY_AND_ASSIGN(OwnedRef); - - PyObject* obj_; -}; - -// Same as OwnedRef, but ensures the GIL is taken when it goes out of scope. -// This is for situations where the GIL is not always known to be held -// (e.g. if it is released in the middle of a function for performance reasons) -class ARROW_PYTHON_EXPORT OwnedRefNoGIL : public OwnedRef { - public: - OwnedRefNoGIL() : OwnedRef() {} - OwnedRefNoGIL(OwnedRefNoGIL&& other) : OwnedRef(other.detach()) {} - explicit OwnedRefNoGIL(PyObject* obj) : OwnedRef(obj) {} - - ~OwnedRefNoGIL() { - // This destructor may be called after the Python interpreter is finalized. - // At least avoid spurious attempts to take the GIL when not necessary. - if (obj() == NULLPTR) { - return; - } - PyAcquireGIL lock; - reset(); - } -}; - -template -struct BoundFunction; - -template -struct BoundFunction { - // We bind `cdef void fn(object, ...)` to get a `Status(...)` - // where the Status contains any Python error raised by `fn` - using Unbound = void(PyObject*, Args...); - using Bound = Status(Args...); - - BoundFunction(Unbound* unbound, PyObject* bound_arg) - : unbound_(unbound), bound_arg_(bound_arg) {} - - Status Invoke(Args... args) const { - PyAcquireGIL lock; - unbound_(bound_arg_.obj(), std::forward(args)...); - RETURN_IF_PYERROR(); - return Status::OK(); - } - - Unbound* unbound_; - OwnedRefNoGIL bound_arg_; -}; - -template -struct BoundFunction { - // We bind `cdef Return fn(object, ...)` to get a `Result(...)` - // where the Result contains any Python error raised by `fn` or the - // return value from `fn`. - using Unbound = Return(PyObject*, Args...); - using Bound = Result(Args...); - - BoundFunction(Unbound* unbound, PyObject* bound_arg) - : unbound_(unbound), bound_arg_(bound_arg) {} - - Result Invoke(Args... 
args) const { - PyAcquireGIL lock; - Return ret = unbound_(bound_arg_.obj(), std::forward(args)...); - RETURN_IF_PYERROR(); - return ret; - } - - Unbound* unbound_; - OwnedRefNoGIL bound_arg_; -}; - -template -std::function BindFunction(Return (*unbound)(PyObject*, Args...), - PyObject* bound_arg) { - using Fn = BoundFunction; - - static_assert(std::is_same::value, - "requested bound function of unsupported type"); - - Py_XINCREF(bound_arg); - auto bound_fn = std::make_shared(unbound, bound_arg); - return - [bound_fn](Args... args) { return bound_fn->Invoke(std::forward(args)...); }; -} - -// A temporary conversion of a Python object to a bytes area. -struct PyBytesView { - const char* bytes; - Py_ssize_t size; - bool is_utf8; - - static Result FromString(PyObject* obj, bool check_utf8 = false) { - PyBytesView self; - ARROW_RETURN_NOT_OK(self.ParseString(obj, check_utf8)); - return std::move(self); - } - - static Result FromUnicode(PyObject* obj) { - PyBytesView self; - ARROW_RETURN_NOT_OK(self.ParseUnicode(obj)); - return std::move(self); - } - - static Result FromBinary(PyObject* obj) { - PyBytesView self; - ARROW_RETURN_NOT_OK(self.ParseBinary(obj)); - return std::move(self); - } - - // View the given Python object as string-like, i.e. str or (utf8) bytes - Status ParseString(PyObject* obj, bool check_utf8 = false) { - if (PyUnicode_Check(obj)) { - return ParseUnicode(obj); - } else { - ARROW_RETURN_NOT_OK(ParseBinary(obj)); - if (check_utf8) { - // Check the bytes are utf8 utf-8 - OwnedRef decoded(PyUnicode_FromStringAndSize(bytes, size)); - if (ARROW_PREDICT_TRUE(!PyErr_Occurred())) { - is_utf8 = true; - } else { - PyErr_Clear(); - is_utf8 = false; - } - } - return Status::OK(); - } - } - - // View the given Python object as unicode string - Status ParseUnicode(PyObject* obj) { - // The utf-8 representation is cached on the unicode object - bytes = PyUnicode_AsUTF8AndSize(obj, &size); - RETURN_IF_PYERROR(); - is_utf8 = true; - return Status::OK(); - } - - // View the given Python object as binary-like, i.e. bytes - Status ParseBinary(PyObject* obj) { - if (PyBytes_Check(obj)) { - bytes = PyBytes_AS_STRING(obj); - size = PyBytes_GET_SIZE(obj); - is_utf8 = false; - } else if (PyByteArray_Check(obj)) { - bytes = PyByteArray_AS_STRING(obj); - size = PyByteArray_GET_SIZE(obj); - is_utf8 = false; - } else if (PyMemoryView_Check(obj)) { - PyObject* ref = PyMemoryView_GetContiguous(obj, PyBUF_READ, 'C'); - RETURN_IF_PYERROR(); - Py_buffer* buffer = PyMemoryView_GET_BUFFER(ref); - bytes = reinterpret_cast(buffer->buf); - size = buffer->len; - is_utf8 = false; - } else { - return Status::TypeError("Expected bytes, got a '", Py_TYPE(obj)->tp_name, - "' object"); - } - return Status::OK(); - } - - protected: - OwnedRef ref; -}; - -class ARROW_PYTHON_EXPORT PyBuffer : public Buffer { - public: - /// While memoryview objects support multi-dimensional buffers, PyBuffer only supports - /// one-dimensional byte buffers. 
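The PyBuffer class whose declaration begins above (its implementation appeared earlier in this diff) wraps the CPython buffer protocol; the core handshake looks like this in isolation (a simplified sketch that copies the bytes out rather than keeping the Py_buffer alive):

    #include <Python.h>

    #include <string>

    // Copy the contents of any contiguous bytes-like object into *out.
    bool CopyBytesLike(PyObject* obj, std::string* out) {
      Py_buffer view;
      // Accept any contiguous byte layout (bytes, bytearray, memoryview, ...).
      if (PyObject_GetBuffer(obj, &view, PyBUF_ANY_CONTIGUOUS) != 0) {
        return false;  // a Python exception is now set
      }
      out->assign(static_cast<const char*>(view.buf),
                  static_cast<std::size_t>(view.len));
      PyBuffer_Release(&view);  // must pair every successful GetBuffer
      return true;
    }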
- ~PyBuffer(); - - static Result> FromPyObject(PyObject* obj); - - private: - PyBuffer(); - Status Init(PyObject*); - - Py_buffer py_buf_; -}; - -// Return the common PyArrow memory pool -ARROW_PYTHON_EXPORT void set_default_memory_pool(MemoryPool* pool); -ARROW_PYTHON_EXPORT MemoryPool* get_memory_pool(); - -// This is annoying: because C++11 does not allow implicit conversion of string -// literals to non-const char*, we need to go through some gymnastics to use -// PyObject_CallMethod without a lot of pain (its arguments are non-const -// char*) -template -static inline PyObject* cpp_PyObject_CallMethod(PyObject* obj, const char* method_name, - const char* argspec, ArgTypes... args) { - return PyObject_CallMethod(obj, const_cast(method_name), - const_cast(argspec), args...); -} - -} // namespace py -} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/csv.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/csv.cc deleted file mode 100644 index 1df3a94..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/csv.cc +++ /dev/null @@ -1,62 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "csv.h" - -#include - -#include "arrow/python/common.h" - -namespace arrow { - -using csv::InvalidRow; -using csv::InvalidRowHandler; -using csv::InvalidRowResult; - -namespace py { -namespace csv { - -InvalidRowHandler MakeInvalidRowHandler(PyInvalidRowCallback cb, PyObject* py_handler) { - if (cb == nullptr) { - return InvalidRowHandler{}; - } - - struct Handler { - PyInvalidRowCallback cb; - std::shared_ptr handler_ref; - - InvalidRowResult operator()(const InvalidRow& invalid_row) { - InvalidRowResult result; - auto st = SafeCallIntoPython([&]() -> Status { - result = cb(handler_ref->obj(), invalid_row); - if (PyErr_Occurred()) { - PyErr_WriteUnraisable(handler_ref->obj()); - } - return Status::OK(); - }); - ARROW_UNUSED(st); - return result; - } - }; - - Py_INCREF(py_handler); - return Handler{cb, std::make_shared(py_handler)}; -} - -} // namespace csv -} // namespace py -} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/csv.h b/src/vendored/apache-arrow-12.0.1/arrow/python/csv.h deleted file mode 100644 index 34302e9..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/csv.h +++ /dev/null @@ -1,42 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include -#include -#include - -#include "arrow/csv/options.h" -#include "arrow/python/common.h" -#include "arrow/util/macros.h" - -namespace arrow { -namespace py { -namespace csv { - -using PyInvalidRowCallback = std::function<::arrow::csv::InvalidRowResult( - PyObject*, const ::arrow::csv::InvalidRow&)>; - -ARROW_PYTHON_EXPORT -::arrow::csv::InvalidRowHandler MakeInvalidRowHandler(PyInvalidRowCallback, - PyObject* handler); - -} // namespace csv -} // namespace py -} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/datetime.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/datetime.cc deleted file mode 100644 index 0e817dd..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/datetime.cc +++ /dev/null @@ -1,663 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
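MakeInvalidRowHandler above demonstrates the general recipe for handing a Python callback to C++ code that may run on any thread: hold a strong reference, re-acquire the GIL around each call, and report rather than propagate Python errors. The same recipe as a dependency-free sketch (BindPyCallable is a hypothetical name, not an Arrow API; needs Python 3.9+ for PyObject_CallNoArgs):

    #include <Python.h>

    #include <functional>
    #include <memory>

    std::function<void()> BindPyCallable(PyObject* callable) {
      Py_INCREF(callable);
      // The deleter re-acquires the GIL before DECREF, because the returned
      // std::function may be destroyed on any thread.
      std::shared_ptr<PyObject> ref(callable, [](PyObject* o) {
        PyGILState_STATE s = PyGILState_Ensure();
        Py_DECREF(o);
        PyGILState_Release(s);
      });
      return [ref]() {
        PyGILState_STATE s = PyGILState_Ensure();
        PyObject* res = PyObject_CallNoArgs(ref.get());
        if (res == nullptr) {
          PyErr_WriteUnraisable(ref.get());  // report, never propagate into C++
        } else {
          Py_DECREF(res);
        }
        PyGILState_Release(s);
      };
    }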
-#include "datetime.h" - -#include -#include -#include -#include -#include - -#include "arrow/array.h" -#include "arrow/python/arrow_to_python_internal.h" -#include "arrow/python/common.h" -#include "arrow/python/helpers.h" -#include "arrow/python/platform.h" -#include "arrow/scalar.h" -#include "arrow/status.h" -#include "arrow/type.h" -#include "arrow/util/logging.h" -#include "arrow/util/regex.h" -#include "arrow/util/value_parsing.h" - -namespace arrow { - -using internal::RegexMatch; - -namespace py { -namespace internal { - -namespace { - -bool MatchFixedOffset(const std::string& tz, std::string_view* sign, - std::string_view* hour, std::string_view* minute) { - static const std::regex regex("^([+-])(0[0-9]|1[0-9]|2[0-3]):([0-5][0-9])$"); - if (tz.size() < 5) { - return false; - } - return RegexMatch(regex, tz, {sign, hour, minute}); -} - -constexpr char* NonConst(const char* st) { - // Hack for python versions < 3.7 where members of PyStruct members - // where non-const (C++ doesn't like assigning string literals to these types) - return const_cast(st); -} - -static PyTypeObject MonthDayNanoTupleType = {}; - -static PyStructSequence_Field MonthDayNanoField[] = { - {NonConst("months"), NonConst("The number of months in the interval")}, - {NonConst("days"), NonConst("The number days in the interval")}, - {NonConst("nanoseconds"), NonConst("The number of nanoseconds in the interval")}, - {nullptr, nullptr}}; - -static PyStructSequence_Desc MonthDayNanoTupleDesc = { - NonConst("MonthDayNano"), - NonConst("A calendar interval consisting of months, days and nanoseconds."), - MonthDayNanoField, - /*n_in_sequence=*/3}; - -} // namespace - -#ifndef PYPY_VERSION -PyDateTime_CAPI* datetime_api = nullptr; - -void InitDatetime() { - PyAcquireGIL lock; - datetime_api = - reinterpret_cast(PyCapsule_Import(PyDateTime_CAPSULE_NAME, 0)); - if (datetime_api == nullptr) { - Py_FatalError("Could not import datetime C API"); - } -} -#endif - -// The following code is adapted from -// https://github.com/numpy/numpy/blob/main/numpy/core/src/multiarray/datetime.c - -// Days per month, regular year and leap year -static int64_t _days_per_month_table[2][12] = { - {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}, - {31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}}; - -static bool is_leapyear(int64_t year) { - return (year & 0x3) == 0 && // year % 4 == 0 - ((year % 100) != 0 || (year % 400) == 0); -} - -// Calculates the days offset from the 1970 epoch. -static int64_t get_days_from_date(int64_t date_year, int64_t date_month, - int64_t date_day) { - int64_t i, month; - int64_t year, days = 0; - int64_t* month_lengths; - - year = date_year - 1970; - days = year * 365; - - // Adjust for leap years - if (days >= 0) { - // 1968 is the closest leap year before 1970. - // Exclude the current year, so add 1. - year += 1; - // Add one day for each 4 years - days += year / 4; - // 1900 is the closest previous year divisible by 100 - year += 68; - // Subtract one day for each 100 years - days -= year / 100; - // 1600 is the closest previous year divisible by 400 - year += 300; - // Add one day for each 400 years - days += year / 400; - } else { - // 1972 is the closest later year after 1970. - // Include the current year, so subtract 2. 
- year -= 2; - // Subtract one day for each 4 years - days += year / 4; - // 2000 is the closest later year divisible by 100 - year -= 28; - // Add one day for each 100 years - days -= year / 100; - // 2000 is also the closest later year divisible by 400 - // Subtract one day for each 400 years - days += year / 400; - } - - month_lengths = _days_per_month_table[is_leapyear(date_year)]; - month = date_month - 1; - - // Add the months - for (i = 0; i < month; ++i) { - days += month_lengths[i]; - } - - // Add the days - days += date_day - 1; - - return days; -} - -// Modifies '*days_' to be the day offset within the year, -// and returns the year. -static int64_t days_to_yearsdays(int64_t* days_) { - const int64_t days_per_400years = (400 * 365 + 100 - 4 + 1); - // Adjust so it's relative to the year 2000 (divisible by 400) - int64_t days = (*days_) - (365 * 30 + 7); - int64_t year; - - // Break down the 400 year cycle to get the year and day within the year - if (days >= 0) { - year = 400 * (days / days_per_400years); - days = days % days_per_400years; - } else { - year = 400 * ((days - (days_per_400years - 1)) / days_per_400years); - days = days % days_per_400years; - if (days < 0) { - days += days_per_400years; - } - } - - // Work out the year/day within the 400 year cycle - if (days >= 366) { - year += 100 * ((days - 1) / (100 * 365 + 25 - 1)); - days = (days - 1) % (100 * 365 + 25 - 1); - if (days >= 365) { - year += 4 * ((days + 1) / (4 * 365 + 1)); - days = (days + 1) % (4 * 365 + 1); - if (days >= 366) { - year += (days - 1) / 365; - days = (days - 1) % 365; - } - } - } - - *days_ = days; - return year + 2000; -} - -// Extracts the month and year and day number from a number of days -static void get_date_from_days(int64_t days, int64_t* date_year, int64_t* date_month, - int64_t* date_day) { - int64_t *month_lengths, i; - - *date_year = days_to_yearsdays(&days); - month_lengths = _days_per_month_table[is_leapyear(*date_year)]; - - for (i = 0; i < 12; ++i) { - if (days < month_lengths[i]) { - *date_month = i + 1; - *date_day = days + 1; - return; - } else { - days -= month_lengths[i]; - } - } - - // Should never get here - return; -} - -// Splitting time quantities, for example splitting total seconds into -// minutes and remaining seconds. After we run -// int64_t remaining = split_time(total, quotient, &next) -// we have -// total = next * quotient + remaining. Handles negative values by propagating -// them: If total is negative, next will be negative and remaining will -// always be non-negative. 
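Concretely, for total = -61 seconds and quotient = 60, the contract described above yields next = -2 and remaining = 59, since -61 = (-2) * 60 + 59. A self-contained check (restating the function defined just below; C++ '%' truncates toward zero, hence the r < 0 adjustment):

    #include <cassert>
    #include <cstdint>

    static int64_t split_time(int64_t total, int64_t quotient, int64_t* next) {
      int64_t r = total % quotient;    // -61 % 60 == -1 (truncating)
      if (r < 0) {
        *next = total / quotient - 1;  // -61 / 60 == -1, step down to -2
        return r + quotient;           // -1 + 60 == 59
      }
      *next = total / quotient;
      return r;
    }

    int main() {
      int64_t minutes = 0;
      int64_t seconds = split_time(-61, 60, &minutes);
      assert(minutes == -2 && seconds == 59);  // -61 == (-2) * 60 + 59
      return 0;
    }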
-static inline int64_t split_time(int64_t total, int64_t quotient, int64_t* next) { - int64_t r = total % quotient; - if (r < 0) { - *next = total / quotient - 1; - return r + quotient; - } else { - *next = total / quotient; - return r; - } -} - -static inline Status PyTime_convert_int(int64_t val, const TimeUnit::type unit, - int64_t* hour, int64_t* minute, int64_t* second, - int64_t* microsecond) { - switch (unit) { - case TimeUnit::NANO: - if (val % 1000 != 0) { - return Status::Invalid("Value ", val, " has non-zero nanoseconds"); - } - val /= 1000; - // fall through - case TimeUnit::MICRO: - *microsecond = split_time(val, 1000000LL, &val); - *second = split_time(val, 60, &val); - *minute = split_time(val, 60, hour); - break; - case TimeUnit::MILLI: - *microsecond = split_time(val, 1000, &val) * 1000; - // fall through - case TimeUnit::SECOND: - *second = split_time(val, 60, &val); - *minute = split_time(val, 60, hour); - break; - default: - break; - } - return Status::OK(); -} - -static inline Status PyDate_convert_int(int64_t val, const DateUnit unit, int64_t* year, - int64_t* month, int64_t* day) { - switch (unit) { - case DateUnit::MILLI: - val /= 86400000LL; // fall through - case DateUnit::DAY: - get_date_from_days(val, year, month, day); - default: - break; - } - return Status::OK(); -} - -PyObject* NewMonthDayNanoTupleType() { - if (MonthDayNanoTupleType.tp_name == nullptr) { - if (PyStructSequence_InitType2(&MonthDayNanoTupleType, &MonthDayNanoTupleDesc) != 0) { - Py_FatalError("Could not initialize MonthDayNanoTuple"); - } - } - Py_INCREF(&MonthDayNanoTupleType); - return (PyObject*)&MonthDayNanoTupleType; -} - -Status PyTime_from_int(int64_t val, const TimeUnit::type unit, PyObject** out) { - int64_t hour = 0, minute = 0, second = 0, microsecond = 0; - RETURN_NOT_OK(PyTime_convert_int(val, unit, &hour, &minute, &second, µsecond)); - *out = PyTime_FromTime(static_cast(hour), static_cast(minute), - static_cast(second), static_cast(microsecond)); - return Status::OK(); -} - -Status PyDate_from_int(int64_t val, const DateUnit unit, PyObject** out) { - int64_t year = 0, month = 0, day = 0; - RETURN_NOT_OK(PyDate_convert_int(val, unit, &year, &month, &day)); - *out = PyDate_FromDate(static_cast(year), static_cast(month), - static_cast(day)); - return Status::OK(); -} - -Status PyDateTime_from_int(int64_t val, const TimeUnit::type unit, PyObject** out) { - int64_t hour = 0, minute = 0, second = 0, microsecond = 0; - RETURN_NOT_OK(PyTime_convert_int(val, unit, &hour, &minute, &second, µsecond)); - int64_t total_days = 0; - hour = split_time(hour, 24, &total_days); - int64_t year = 0, month = 0, day = 0; - get_date_from_days(total_days, &year, &month, &day); - *out = PyDateTime_FromDateAndTime( - static_cast(year), static_cast(month), static_cast(day), - static_cast(hour), static_cast(minute), - static_cast(second), static_cast(microsecond)); - return Status::OK(); -} - -int64_t PyDate_to_days(PyDateTime_Date* pydate) { - return get_days_from_date(PyDateTime_GET_YEAR(pydate), PyDateTime_GET_MONTH(pydate), - PyDateTime_GET_DAY(pydate)); -} - -Result PyDateTime_utcoffset_s(PyObject* obj) { - // calculate offset from UTC timezone in seconds - // supports only PyDateTime_DateTime and PyDateTime_Time objects - OwnedRef pyoffset(PyObject_CallMethod(obj, "utcoffset", NULL)); - RETURN_IF_PYERROR(); - if (pyoffset.obj() != nullptr && pyoffset.obj() != Py_None) { - auto delta = reinterpret_cast(pyoffset.obj()); - return internal::PyDelta_to_s(delta); - } else { - return 0; - } -} - -Result 
PyTZInfo_utcoffset_hhmm(PyObject* pytzinfo) { - // attempt to convert timezone offset objects to "+/-{hh}:{mm}" format - OwnedRef pydelta_object(PyObject_CallMethod(pytzinfo, "utcoffset", "O", Py_None)); - RETURN_IF_PYERROR(); - - if (!PyDelta_Check(pydelta_object.obj())) { - return Status::Invalid( - "Object returned by tzinfo.utcoffset(None) is not an instance of " - "datetime.timedelta"); - } - auto pydelta = reinterpret_cast(pydelta_object.obj()); - - // retrieve the offset as seconds - auto total_seconds = internal::PyDelta_to_s(pydelta); - - // determine whether the offset is positive or negative - auto sign = (total_seconds < 0) ? "-" : "+"; - total_seconds = abs(total_seconds); - - // calculate offset components - int64_t hours, minutes, seconds; - seconds = split_time(total_seconds, 60, &minutes); - minutes = split_time(minutes, 60, &hours); - if (seconds > 0) { - // check there are no remaining seconds - return Status::Invalid("Offset must represent whole number of minutes"); - } - - // construct the timezone string - std::stringstream stream; - stream << sign << std::setfill('0') << std::setw(2) << hours << ":" << std::setfill('0') - << std::setw(2) << minutes; - return stream.str(); -} - -// Converted from python. See https://github.com/apache/arrow/pull/7604 -// for details. -Result StringToTzinfo(const std::string& tz) { - std::string_view sign_str, hour_str, minute_str; - OwnedRef pytz; - OwnedRef zoneinfo; - OwnedRef datetime; - - if (internal::ImportModule("pytz", &pytz).ok()) { - if (MatchFixedOffset(tz, &sign_str, &hour_str, &minute_str)) { - int sign = -1; - if (sign_str == "+") { - sign = 1; - } - OwnedRef fixed_offset; - RETURN_NOT_OK(internal::ImportFromModule(pytz.obj(), "FixedOffset", &fixed_offset)); - uint32_t minutes, hours; - if (!::arrow::internal::ParseUnsigned(hour_str.data(), hour_str.size(), &hours) || - !::arrow::internal::ParseUnsigned(minute_str.data(), minute_str.size(), - &minutes)) { - return Status::Invalid("Invalid timezone: ", tz); - } - OwnedRef total_minutes(PyLong_FromLong( - sign * ((static_cast(hours) * 60) + static_cast(minutes)))); - RETURN_IF_PYERROR(); - auto tzinfo = - PyObject_CallFunctionObjArgs(fixed_offset.obj(), total_minutes.obj(), NULL); - RETURN_IF_PYERROR(); - return tzinfo; - } - - OwnedRef timezone; - RETURN_NOT_OK(internal::ImportFromModule(pytz.obj(), "timezone", &timezone)); - OwnedRef py_tz_string( - PyUnicode_FromStringAndSize(tz.c_str(), static_cast(tz.size()))); - auto tzinfo = PyObject_CallFunctionObjArgs(timezone.obj(), py_tz_string.obj(), NULL); - RETURN_IF_PYERROR(); - return tzinfo; - } - - // catch fixed offset if pytz is not present - if (MatchFixedOffset(tz, &sign_str, &hour_str, &minute_str)) { - RETURN_NOT_OK(internal::ImportModule("datetime", &datetime)); - int sign = -1; - if (sign_str == "+") { - sign = 1; - } - - // import timezone and timedelta module to create a tzinfo object - OwnedRef class_timezone; - OwnedRef class_timedelta; - RETURN_NOT_OK( - internal::ImportFromModule(datetime.obj(), "timezone", &class_timezone)); - RETURN_NOT_OK( - internal::ImportFromModule(datetime.obj(), "timedelta", &class_timedelta)); - - // check input - uint32_t minutes, hours; - if (!::arrow::internal::ParseUnsigned(hour_str.data(), hour_str.size(), &hours) || - !::arrow::internal::ParseUnsigned(minute_str.data(), minute_str.size(), - &minutes)) { - return Status::Invalid("Invalid timezone: ", tz); - } - - // save offset as a signed integer - OwnedRef total_minutes(PyLong_FromLong( - sign * ((static_cast(hours) * 60) + 
static_cast(minutes)))); - // create zero integers for empty arguments in datetime.timedelta - OwnedRef zero(PyLong_FromLong(static_cast(0))); - - // call datetime.timedelta to get correct offset object for datetime.timezone - auto offset = - PyObject_CallFunctionObjArgs(class_timedelta.obj(), zero.obj(), zero.obj(), - zero.obj(), zero.obj(), total_minutes.obj(), NULL); - RETURN_IF_PYERROR(); - // call datetime.timezone - auto tzinfo = PyObject_CallFunctionObjArgs(class_timezone.obj(), offset, NULL); - RETURN_IF_PYERROR(); - return tzinfo; - } - - // fallback on zoneinfo if tz is string and pytz is not present - if (internal::ImportModule("zoneinfo", &zoneinfo).ok()) { - OwnedRef class_zoneinfo; - RETURN_NOT_OK( - internal::ImportFromModule(zoneinfo.obj(), "ZoneInfo", &class_zoneinfo)); - OwnedRef py_tz_string( - PyUnicode_FromStringAndSize(tz.c_str(), static_cast(tz.size()))); - auto tzinfo = - PyObject_CallFunctionObjArgs(class_zoneinfo.obj(), py_tz_string.obj(), NULL); - RETURN_IF_PYERROR(); - return tzinfo; - } - - return Status::Invalid( - "Pytz package or Python>=3.8 for zoneinfo module must be installed."); -} - -Result TzinfoToString(PyObject* tzinfo) { - OwnedRef module_pytz; // import pytz - OwnedRef module_datetime; // import datetime - OwnedRef module_zoneinfo; // import zoneinfo - OwnedRef module_dateutil; // import dateutil - OwnedRef class_timezone; // from datetime import timezone - OwnedRef class_fixedoffset; // from pytz import _FixedOffset - OwnedRef class_basetzinfo; // from pytz import BaseTzInfo - OwnedRef class_zoneinfo; // from zoneinfo import ZoneInfo - OwnedRef class_tzfile; // from zoneinfo import tzfile - - // import necessary modules - RETURN_NOT_OK(internal::ImportModule("datetime", &module_datetime)); - // import necessary classes - RETURN_NOT_OK( - internal::ImportFromModule(module_datetime.obj(), "timezone", &class_timezone)); - - // check that it's a valid tzinfo object - if (!PyTZInfo_Check(tzinfo)) { - return Status::TypeError("Not an instance of datetime.tzinfo"); - } - - // if tzinfo is an instance of datetime.timezone return the - // HH:MM offset string representation - if (PyObject_IsInstance(tzinfo, class_timezone.obj())) { - // still recognize datetime.timezone.utc as UTC (instead of +00:00) - OwnedRef tzname_object(PyObject_CallMethod(tzinfo, "tzname", "O", Py_None)); - RETURN_IF_PYERROR(); - if (PyUnicode_Check(tzname_object.obj())) { - std::string result; - RETURN_NOT_OK(internal::PyUnicode_AsStdString(tzname_object.obj(), &result)); - if (result == "UTC") { - return result; - } - } - return PyTZInfo_utcoffset_hhmm(tzinfo); - } - - // Try to import pytz if it is available - if (internal::ImportModule("pytz", &module_pytz).ok()) { - RETURN_NOT_OK(internal::ImportFromModule(module_pytz.obj(), "_FixedOffset", - &class_fixedoffset)); - RETURN_NOT_OK( - internal::ImportFromModule(module_pytz.obj(), "BaseTzInfo", &class_basetzinfo)); - } - - // if tzinfo is an instance of pytz._FixedOffset return the - // HH:MM offset string representation - if (module_pytz.obj() != nullptr && - PyObject_IsInstance(tzinfo, class_fixedoffset.obj())) { - OwnedRef tzname_object(PyObject_CallMethod(tzinfo, "tzname", "O", Py_None)); - RETURN_IF_PYERROR(); - return PyTZInfo_utcoffset_hhmm(tzinfo); - } - - // if pytz is installed and tzinfo is and instance of pytz.BaseTzInfo - if (module_pytz.obj() != nullptr && - PyObject_IsInstance(tzinfo, class_basetzinfo.obj())) { - OwnedRef zone(PyObject_GetAttrString(tzinfo, "zone")); - RETURN_IF_PYERROR(); - std::string result; - 
RETURN_NOT_OK(internal::PyUnicode_AsStdString(zone.obj(), &result)); - return result; - } - - // Try to import zoneinfo if it is available - if (internal::ImportModule("zoneinfo", &module_zoneinfo).ok()) { - RETURN_NOT_OK( - internal::ImportFromModule(module_zoneinfo.obj(), "ZoneInfo", &class_zoneinfo)); - } - - // if zoneinfo is installed and tzinfo is an instance of zoneinfo.ZoneInfo - if (module_zoneinfo.obj() != nullptr && - PyObject_IsInstance(tzinfo, class_zoneinfo.obj())) { - OwnedRef key(PyObject_GetAttrString(tzinfo, "key")); - RETURN_IF_PYERROR(); - std::string result; - RETURN_NOT_OK(internal::PyUnicode_AsStdString(key.obj(), &result)); - return result; - } - - // Try to import dateutil if it is available - if (internal::ImportModule("dateutil.tz", &module_dateutil).ok()) { - RETURN_NOT_OK( - internal::ImportFromModule(module_dateutil.obj(), "tzfile", &class_tzfile)); - } - - // if dateutil is installed and tzinfo is an instance of dateutil.tz.tzfile - if (module_dateutil.obj() != nullptr && - PyObject_IsInstance(tzinfo, class_tzfile.obj())) { - OwnedRef _filename(PyObject_GetAttrString(tzinfo, "_filename")); - RETURN_IF_PYERROR(); - std::string result; - RETURN_NOT_OK(internal::PyUnicode_AsStdString(_filename.obj(), &result)); - // _filename returns a full path in general ('/usr/share/zoneinfo/Europe/Paris') - // or POSIX name on Windows ('Europe/Paris') - we need a substring in first case - std::size_t pos = result.find("zoneinfo/"); - if (pos != std::string::npos) { - return result.substr(pos + 9); - } - return result; - } - - // attempt to call tzinfo.tzname(None) - OwnedRef tzname_object(PyObject_CallMethod(tzinfo, "tzname", "O", Py_None)); - RETURN_IF_PYERROR(); - if (PyUnicode_Check(tzname_object.obj())) { - std::string result; - RETURN_NOT_OK(internal::PyUnicode_AsStdString(tzname_object.obj(), &result)); - return result; - } - - // fall back to HH:MM offset string representation based on tzinfo.utcoffset(None) - return PyTZInfo_utcoffset_hhmm(tzinfo); -} - -PyObject* MonthDayNanoIntervalToNamedTuple( - const MonthDayNanoIntervalType::MonthDayNanos& interval) { - OwnedRef tuple(PyStructSequence_New(&MonthDayNanoTupleType)); - if (ARROW_PREDICT_FALSE(tuple.obj() == nullptr)) { - return nullptr; - } - PyStructSequence_SetItem(tuple.obj(), /*pos=*/0, PyLong_FromLong(interval.months)); - PyStructSequence_SetItem(tuple.obj(), /*pos=*/1, PyLong_FromLong(interval.days)); - PyStructSequence_SetItem(tuple.obj(), /*pos=*/2, - PyLong_FromLongLong(interval.nanoseconds)); - return tuple.detach(); -} - -namespace { - -// Wrapper around a Python list object that mimics dereference and assignment -// operations. 
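The MonthDayNano plumbing above uses CPython's PyStructSequence API to build namedtuple-like objects from C. A standalone sketch of the same pattern (toy type and field names; assumes Python >= 3.7 headers, where the field strings are const-qualified, so no NonConst hack is needed):

    #include <Python.h>

    static PyStructSequence_Field kIntervalFields[] = {
        {"months", "months component"},
        {"days", "days component"},
        {"nanoseconds", "nanoseconds component"},
        {nullptr, nullptr}};

    static PyStructSequence_Desc kIntervalDesc = {
        "Interval", "A toy calendar-interval namedtuple.", kIntervalFields, 3};

    static PyTypeObject IntervalType = {};

    PyObject* MakeInterval(long months, long days, long long nanos) {
      if (IntervalType.tp_name == nullptr &&  // lazy one-time init
          PyStructSequence_InitType2(&IntervalType, &kIntervalDesc) != 0) {
        return nullptr;
      }
      PyObject* t = PyStructSequence_New(&IntervalType);
      if (t == nullptr) return nullptr;
      // SetItem steals the references returned by PyLong_From*.
      PyStructSequence_SetItem(t, 0, PyLong_FromLong(months));
      PyStructSequence_SetItem(t, 1, PyLong_FromLong(days));
      PyStructSequence_SetItem(t, 2, PyLong_FromLongLong(nanos));
      return t;
    }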
-struct PyListAssigner { - public: - explicit PyListAssigner(PyObject* list) : list_(list) { DCHECK(PyList_Check(list_)); } - - PyListAssigner& operator*() { return *this; } - - void operator=(PyObject* obj) { - if (ARROW_PREDICT_FALSE(PyList_SetItem(list_, current_index_, obj) == -1)) { - Py_FatalError("list did not have the correct preallocated size."); - } - } - - PyListAssigner& operator++() { - current_index_++; - return *this; - } - - PyListAssigner& operator+=(int64_t offset) { - current_index_ += offset; - return *this; - } - - private: - PyObject* list_; - int64_t current_index_ = 0; -}; - -} // namespace - -Result MonthDayNanoIntervalArrayToPyList( - const MonthDayNanoIntervalArray& array) { - OwnedRef out_list(PyList_New(array.length())); - RETURN_IF_PYERROR(); - PyListAssigner out_objects(out_list.obj()); - auto& interval_array = - arrow::internal::checked_cast(array); - RETURN_NOT_OK(internal::WriteArrayObjects( - interval_array, - [&](const MonthDayNanoIntervalType::MonthDayNanos& interval, PyListAssigner& out) { - PyObject* tuple = internal::MonthDayNanoIntervalToNamedTuple(interval); - if (ARROW_PREDICT_FALSE(tuple == nullptr)) { - RETURN_IF_PYERROR(); - } - - *out = tuple; - return Status::OK(); - }, - out_objects)); - return out_list.detach(); -} - -Result MonthDayNanoIntervalScalarToPyObject( - const MonthDayNanoIntervalScalar& scalar) { - if (scalar.is_valid) { - return internal::MonthDayNanoIntervalToNamedTuple(scalar.value); - } else { - Py_INCREF(Py_None); - return Py_None; - } -} - -} // namespace internal -} // namespace py -} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/datetime.h b/src/vendored/apache-arrow-12.0.1/arrow/python/datetime.h deleted file mode 100644 index 327a61f..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/datetime.h +++ /dev/null @@ -1,231 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include - -#include "arrow/python/platform.h" -#include "arrow/python/visibility.h" -#include "arrow/result.h" -#include "arrow/status.h" -#include "arrow/type.h" -#include "arrow/type_fwd.h" -#include "arrow/util/int_util_overflow.h" -#include "arrow/util/logging.h" - -// By default, PyDateTimeAPI is a *static* variable. This forces -// PyDateTime_IMPORT to be called in every C/C++ module using the -// C datetime API. This is error-prone and potentially costly. -// Instead, we redefine PyDateTimeAPI to point to a global variable, -// which is initialized once by calling InitDatetime(). 
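For contrast, here is the per-module bootstrap that the macro redefinition above avoids repeating; note that the datetime C-API is itself delivered through a PyCapsule, the same mechanism this patch series adopts for Arrow data (minimal sketch):

    #include <Python.h>
    #include <datetime.h>  // CPython's datetime C-API header

    // Returns 0 on success, -1 with a Python error set on failure.
    static int EnsureDatetimeCAPI() {
      if (PyDateTimeAPI == nullptr) {
        PyDateTime_IMPORT;  // a PyCapsule_Import of "datetime.datetime_CAPI"
        if (PyDateTimeAPI == nullptr) return -1;
      }
      return 0;
    }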
-#ifdef PYPY_VERSION
-#include "datetime.h"
-#else
-#define PyDateTimeAPI ::arrow::py::internal::datetime_api
-#endif
-
-namespace arrow {
-using internal::AddWithOverflow;
-using internal::MultiplyWithOverflow;
-namespace py {
-namespace internal {
-
-#ifndef PYPY_VERSION
-extern PyDateTime_CAPI* datetime_api;
-
-ARROW_PYTHON_EXPORT
-void InitDatetime();
-#endif
-
-// Returns the MonthDayNano namedtuple type (increments the reference count).
-ARROW_PYTHON_EXPORT
-PyObject* NewMonthDayNanoTupleType();
-
-ARROW_PYTHON_EXPORT
-inline int64_t PyTime_to_us(PyObject* pytime) {
-  return (PyDateTime_TIME_GET_HOUR(pytime) * 3600000000LL +
-          PyDateTime_TIME_GET_MINUTE(pytime) * 60000000LL +
-          PyDateTime_TIME_GET_SECOND(pytime) * 1000000LL +
-          PyDateTime_TIME_GET_MICROSECOND(pytime));
-}
-
-ARROW_PYTHON_EXPORT
-inline int64_t PyTime_to_s(PyObject* pytime) { return PyTime_to_us(pytime) / 1000000; }
-
-ARROW_PYTHON_EXPORT
-inline int64_t PyTime_to_ms(PyObject* pytime) { return PyTime_to_us(pytime) / 1000; }
-
-ARROW_PYTHON_EXPORT
-inline int64_t PyTime_to_ns(PyObject* pytime) { return PyTime_to_us(pytime) * 1000; }
-
-ARROW_PYTHON_EXPORT
-Status PyTime_from_int(int64_t val, const TimeUnit::type unit, PyObject** out);
-
-ARROW_PYTHON_EXPORT
-Status PyDate_from_int(int64_t val, const DateUnit unit, PyObject** out);
-
-// WARNING: This function returns a naive datetime.
-ARROW_PYTHON_EXPORT
-Status PyDateTime_from_int(int64_t val, const TimeUnit::type unit, PyObject** out);
-
-// This declaration must be the same as in filesystem/filesystem.h
-using TimePoint =
-    std::chrono::time_point<std::chrono::system_clock, std::chrono::nanoseconds>;
-
-ARROW_PYTHON_EXPORT
-int64_t PyDate_to_days(PyDateTime_Date* pydate);
-
-ARROW_PYTHON_EXPORT
-inline int64_t PyDate_to_s(PyDateTime_Date* pydate) {
-  return PyDate_to_days(pydate) * 86400LL;
-}
-
-ARROW_PYTHON_EXPORT
-inline int64_t PyDate_to_ms(PyDateTime_Date* pydate) {
-  return PyDate_to_days(pydate) * 86400000LL;
-}
-
-ARROW_PYTHON_EXPORT
-inline int64_t PyDateTime_to_s(PyDateTime_DateTime* pydatetime) {
-  return (PyDate_to_s(reinterpret_cast<PyDateTime_Date*>(pydatetime)) +
-          PyDateTime_DATE_GET_HOUR(pydatetime) * 3600LL +
-          PyDateTime_DATE_GET_MINUTE(pydatetime) * 60LL +
-          PyDateTime_DATE_GET_SECOND(pydatetime));
-}
-
-ARROW_PYTHON_EXPORT
-inline int64_t PyDateTime_to_ms(PyDateTime_DateTime* pydatetime) {
-  return (PyDateTime_to_s(pydatetime) * 1000LL +
-          PyDateTime_DATE_GET_MICROSECOND(pydatetime) / 1000);
-}
-
-ARROW_PYTHON_EXPORT
-inline int64_t PyDateTime_to_us(PyDateTime_DateTime* pydatetime) {
-  return (PyDateTime_to_s(pydatetime) * 1000000LL +
-          PyDateTime_DATE_GET_MICROSECOND(pydatetime));
-}
-
-ARROW_PYTHON_EXPORT
-inline int64_t PyDateTime_to_ns(PyDateTime_DateTime* pydatetime) {
-  return PyDateTime_to_us(pydatetime) * 1000LL;
-}
-
-ARROW_PYTHON_EXPORT
-inline TimePoint PyDateTime_to_TimePoint(PyDateTime_DateTime* pydatetime) {
-  return TimePoint(TimePoint::duration(PyDateTime_to_ns(pydatetime)));
-}
-
-ARROW_PYTHON_EXPORT
-inline int64_t TimePoint_to_ns(TimePoint val) { return val.time_since_epoch().count(); }
-
-ARROW_PYTHON_EXPORT
-inline TimePoint TimePoint_from_s(double val) {
-  return TimePoint(TimePoint::duration(static_cast<int64_t>(1e9 * val)));
-}
-
-ARROW_PYTHON_EXPORT
-inline TimePoint TimePoint_from_ns(int64_t val) {
-  return TimePoint(TimePoint::duration(val));
-}
-
-ARROW_PYTHON_EXPORT
-inline int64_t PyDelta_to_s(PyDateTime_Delta* pytimedelta) {
-  return (PyDateTime_DELTA_GET_DAYS(pytimedelta) * 86400LL +
-          PyDateTime_DELTA_GET_SECONDS(pytimedelta));
-}
-
-ARROW_PYTHON_EXPORT
-inline int64_t PyDelta_to_ms(PyDateTime_Delta* pytimedelta) {
-  return (PyDelta_to_s(pytimedelta) * 1000LL +
-          PyDateTime_DELTA_GET_MICROSECONDS(pytimedelta) / 1000);
-}
-
-ARROW_PYTHON_EXPORT
-inline Result<int64_t> PyDelta_to_us(PyDateTime_Delta* pytimedelta) {
-  int64_t result = PyDelta_to_s(pytimedelta);
-  if (MultiplyWithOverflow(result, 1000000LL, &result)) {
-    return Status::Invalid("Timedelta too large to fit in 64-bit integer");
-  }
-  if (AddWithOverflow(result, PyDateTime_DELTA_GET_MICROSECONDS(pytimedelta), &result)) {
-    return Status::Invalid("Timedelta too large to fit in 64-bit integer");
-  }
-  return result;
-}
-
-ARROW_PYTHON_EXPORT
-inline Result<int64_t> PyDelta_to_ns(PyDateTime_Delta* pytimedelta) {
-  ARROW_ASSIGN_OR_RAISE(int64_t result, PyDelta_to_us(pytimedelta));
-  if (MultiplyWithOverflow(result, 1000LL, &result)) {
-    return Status::Invalid("Timedelta too large to fit in 64-bit integer");
-  }
-  return result;
-}
-
-ARROW_PYTHON_EXPORT
-Result<int64_t> PyDateTime_utcoffset_s(PyObject* pydatetime);
-
-/// \brief Convert a time zone name into a time zone object.
-///
-/// Supported input strings are:
-/// * As used in the Olson time zone database (the "tz database" or
-///   "tzdata"), such as "America/New_York"
-/// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
-/// GIL must be held when calling this method.
-ARROW_PYTHON_EXPORT
-Result<PyObject*> StringToTzinfo(const std::string& tz);
-
-/// \brief Convert a time zone object to a string representation.
-///
-/// The output strings are:
-/// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
-///   if the input object is either an instance of pytz._FixedOffset or
-///   datetime.timedelta
-/// * The timezone's name if the input object's tzname() method returns with a
-///   non-empty timezone name such as "UTC" or "America/New_York"
-///
-/// GIL must be held when calling this method.
-ARROW_PYTHON_EXPORT
-Result<std::string> TzinfoToString(PyObject* pytzinfo);
-
-/// \brief Convert MonthDayNano to a python namedtuple.
-///
-/// Return a named tuple (pyarrow.MonthDayNano) containing attributes
-/// "months", "days", "nanoseconds" in the given order
-/// with values extracted from the fields on interval.
-///
-/// GIL must be held when calling this method.
-ARROW_PYTHON_EXPORT
-PyObject* MonthDayNanoIntervalToNamedTuple(
-    const MonthDayNanoIntervalType::MonthDayNanos& interval);
-
-/// \brief Convert the given Array to a PyList object containing
-/// pyarrow.MonthDayNano objects.
-ARROW_PYTHON_EXPORT
-Result<PyObject*> MonthDayNanoIntervalArrayToPyList(
-    const MonthDayNanoIntervalArray& array);
-
-/// \brief Convert the Scalar object to a pyarrow.MonthDayNano (or None if
-/// it isn't valid).
-ARROW_PYTHON_EXPORT
-Result<PyObject*> MonthDayNanoIntervalScalarToPyObject(
-    const MonthDayNanoIntervalScalar& scalar);
-
-}  // namespace internal
-}  // namespace py
-}  // namespace arrow
diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/decimal.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/decimal.cc
deleted file mode 100644
index 0c00fcf..0000000
--- a/src/vendored/apache-arrow-12.0.1/arrow/python/decimal.cc
+++ /dev/null
@@ -1,246 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include -#include - -#include "arrow/python/common.h" -#include "arrow/python/decimal.h" -#include "arrow/python/helpers.h" -#include "arrow/type_fwd.h" -#include "arrow/util/decimal.h" -#include "arrow/util/logging.h" - -namespace arrow { -namespace py { -namespace internal { - -Status ImportDecimalType(OwnedRef* decimal_type) { - OwnedRef decimal_module; - RETURN_NOT_OK(ImportModule("decimal", &decimal_module)); - RETURN_NOT_OK(ImportFromModule(decimal_module.obj(), "Decimal", decimal_type)); - return Status::OK(); -} - -Status PythonDecimalToString(PyObject* python_decimal, std::string* out) { - // Call Python's str(decimal_object) - return PyObject_StdStringStr(python_decimal, out); -} - -// \brief Infer the precision and scale of a Python decimal.Decimal instance -// \param python_decimal[in] An instance of decimal.Decimal -// \param precision[out] The value of the inferred precision -// \param scale[out] The value of the inferred scale -// \return The status of the operation -static Status InferDecimalPrecisionAndScale(PyObject* python_decimal, int32_t* precision, - int32_t* scale) { - DCHECK_NE(python_decimal, NULLPTR); - DCHECK_NE(precision, NULLPTR); - DCHECK_NE(scale, NULLPTR); - - // TODO(phillipc): Make sure we perform PyDecimal_Check(python_decimal) as a DCHECK - OwnedRef as_tuple(PyObject_CallMethod(python_decimal, const_cast("as_tuple"), - const_cast(""))); - RETURN_IF_PYERROR(); - DCHECK(PyTuple_Check(as_tuple.obj())); - - OwnedRef digits(PyObject_GetAttrString(as_tuple.obj(), "digits")); - RETURN_IF_PYERROR(); - DCHECK(PyTuple_Check(digits.obj())); - - const auto num_digits = static_cast(PyTuple_Size(digits.obj())); - RETURN_IF_PYERROR(); - - OwnedRef py_exponent(PyObject_GetAttrString(as_tuple.obj(), "exponent")); - RETURN_IF_PYERROR(); - DCHECK(IsPyInteger(py_exponent.obj())); - - const auto exponent = static_cast(PyLong_AsLong(py_exponent.obj())); - RETURN_IF_PYERROR(); - - if (exponent < 0) { - // If exponent > num_digits, we have a number with leading zeros - // such as 0.01234. Ensure we have enough precision for leading zeros - // (which are not included in num_digits). - *precision = std::max(num_digits, -exponent); - *scale = -exponent; - } else { - // Trailing zeros are not included in num_digits, need to add to precision. - // Note we don't generate negative scales as they are poorly supported - // in non-Arrow systems. 
- *precision = num_digits + exponent; - *scale = 0; - } - return Status::OK(); -} - -PyObject* DecimalFromString(PyObject* decimal_constructor, - const std::string& decimal_string) { - DCHECK_NE(decimal_constructor, nullptr); - - auto string_size = decimal_string.size(); - DCHECK_GT(string_size, 0); - - auto string_bytes = decimal_string.c_str(); - DCHECK_NE(string_bytes, nullptr); - - return PyObject_CallFunction(decimal_constructor, const_cast("s#"), string_bytes, - static_cast(string_size)); -} - -namespace { - -template -Status DecimalFromStdString(const std::string& decimal_string, - const DecimalType& arrow_type, ArrowDecimal* out) { - int32_t inferred_precision; - int32_t inferred_scale; - - RETURN_NOT_OK(ArrowDecimal::FromString(decimal_string, out, &inferred_precision, - &inferred_scale)); - - const int32_t precision = arrow_type.precision(); - const int32_t scale = arrow_type.scale(); - - if (scale != inferred_scale) { - DCHECK_NE(out, NULLPTR); - ARROW_ASSIGN_OR_RAISE(*out, out->Rescale(inferred_scale, scale)); - } - - auto inferred_scale_delta = inferred_scale - scale; - if (ARROW_PREDICT_FALSE((inferred_precision - inferred_scale_delta) > precision)) { - return Status::Invalid( - "Decimal type with precision ", inferred_precision, - " does not fit into precision inferred from first array element: ", precision); - } - - return Status::OK(); -} - -template -Status InternalDecimalFromPythonDecimal(PyObject* python_decimal, - const DecimalType& arrow_type, - ArrowDecimal* out) { - DCHECK_NE(python_decimal, NULLPTR); - DCHECK_NE(out, NULLPTR); - - std::string string; - RETURN_NOT_OK(PythonDecimalToString(python_decimal, &string)); - return DecimalFromStdString(string, arrow_type, out); -} - -template -Status InternalDecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type, - ArrowDecimal* out) { - DCHECK_NE(obj, NULLPTR); - DCHECK_NE(out, NULLPTR); - - if (IsPyInteger(obj)) { - // TODO: add a fast path for small-ish ints - std::string string; - RETURN_NOT_OK(PyObject_StdStringStr(obj, &string)); - return DecimalFromStdString(string, arrow_type, out); - } else if (PyDecimal_Check(obj)) { - return InternalDecimalFromPythonDecimal(obj, arrow_type, out); - } else { - return Status::TypeError("int or Decimal object expected, got ", - Py_TYPE(obj)->tp_name); - } -} - -} // namespace - -Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type, - Decimal128* out) { - return InternalDecimalFromPythonDecimal(python_decimal, arrow_type, out); -} - -Status DecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type, - Decimal128* out) { - return InternalDecimalFromPyObject(obj, arrow_type, out); -} - -Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type, - Decimal256* out) { - return InternalDecimalFromPythonDecimal(python_decimal, arrow_type, out); -} - -Status DecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type, - Decimal256* out) { - return InternalDecimalFromPyObject(obj, arrow_type, out); -} - -bool PyDecimal_Check(PyObject* obj) { - static OwnedRef decimal_type; - if (!decimal_type.obj()) { - ARROW_CHECK_OK(ImportDecimalType(&decimal_type)); - DCHECK(PyType_Check(decimal_type.obj())); - } - // PyObject_IsInstance() is slower as it has to check for virtual subclasses - const int result = - PyType_IsSubtype(Py_TYPE(obj), reinterpret_cast(decimal_type.obj())); - ARROW_CHECK_NE(result, -1) << " error during PyType_IsSubtype check"; - return result == 1; -} - -bool PyDecimal_ISNAN(PyObject* obj) { - 
DCHECK(PyDecimal_Check(obj)) << "obj is not an instance of decimal.Decimal"; - OwnedRef is_nan( - PyObject_CallMethod(obj, const_cast("is_nan"), const_cast(""))); - return PyObject_IsTrue(is_nan.obj()) == 1; -} - -DecimalMetadata::DecimalMetadata() - : DecimalMetadata(std::numeric_limits::min(), - std::numeric_limits::min()) {} - -DecimalMetadata::DecimalMetadata(int32_t precision, int32_t scale) - : precision_(precision), scale_(scale) {} - -Status DecimalMetadata::Update(int32_t suggested_precision, int32_t suggested_scale) { - const int32_t current_scale = scale_; - scale_ = std::max(current_scale, suggested_scale); - - const int32_t current_precision = precision_; - - if (current_precision == std::numeric_limits::min()) { - precision_ = suggested_precision; - } else { - auto num_digits = std::max(current_precision - current_scale, - suggested_precision - suggested_scale); - precision_ = std::max(num_digits + scale_, current_precision); - } - - return Status::OK(); -} - -Status DecimalMetadata::Update(PyObject* object) { - bool is_decimal = PyDecimal_Check(object); - - if (ARROW_PREDICT_FALSE(!is_decimal || PyDecimal_ISNAN(object))) { - return Status::OK(); - } - - int32_t precision = 0; - int32_t scale = 0; - RETURN_NOT_OK(InferDecimalPrecisionAndScale(object, &precision, &scale)); - return Update(precision, scale); -} - -} // namespace internal -} // namespace py -} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/decimal.h b/src/vendored/apache-arrow-12.0.1/arrow/python/decimal.h deleted file mode 100644 index 1187037..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/decimal.h +++ /dev/null @@ -1,128 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
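[Editor's note: a worked example of the widening rule in DecimalMetadata::Update() above. The values are illustrative; the arithmetic follows the max() expressions just shown.]

    // Sketch: two updates, e.g. for the values 123.45 and 1.234.
    arrow::py::internal::DecimalMetadata meta;
    ARROW_CHECK_OK(meta.Update(/*suggested_precision=*/5, /*suggested_scale=*/2));
    ARROW_CHECK_OK(meta.Update(/*suggested_precision=*/4, /*suggested_scale=*/3));
    // scale_     = max(2, 3)          = 3
    // num_digits = max(5 - 2, 4 - 3)  = 3   (integer digits needed)
    // precision_ = max(3 + 3, 5)      = 6   -> decimal(6, 3) holds both values
    ARROW_CHECK_EQ(meta.precision(), 6);
    ARROW_CHECK_EQ(meta.scale(), 3);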
- -#pragma once - -#include - -#include "arrow/python/visibility.h" -#include "arrow/type.h" - -namespace arrow { - -class Decimal128; -class Decimal256; - -namespace py { - -class OwnedRef; - -// -// Python Decimal support -// - -namespace internal { - -// \brief Import the Python Decimal type -ARROW_PYTHON_EXPORT -Status ImportDecimalType(OwnedRef* decimal_type); - -// \brief Convert a Python Decimal object to a C++ string -// \param[in] python_decimal A Python decimal.Decimal instance -// \param[out] The string representation of the Python Decimal instance -// \return The status of the operation -ARROW_PYTHON_EXPORT -Status PythonDecimalToString(PyObject* python_decimal, std::string* out); - -// \brief Convert a C++ std::string to a Python Decimal instance -// \param[in] decimal_constructor The decimal type object -// \param[in] decimal_string A decimal string -// \return An instance of decimal.Decimal -ARROW_PYTHON_EXPORT -PyObject* DecimalFromString(PyObject* decimal_constructor, - const std::string& decimal_string); - -// \brief Convert a Python decimal to an Arrow Decimal128 object -// \param[in] python_decimal A Python decimal.Decimal instance -// \param[in] arrow_type An instance of arrow::DecimalType -// \param[out] out A pointer to a Decimal128 -// \return The status of the operation -ARROW_PYTHON_EXPORT -Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type, - Decimal128* out); - -// \brief Convert a Python object to an Arrow Decimal128 object -// \param[in] python_decimal A Python int or decimal.Decimal instance -// \param[in] arrow_type An instance of arrow::DecimalType -// \param[out] out A pointer to a Decimal128 -// \return The status of the operation -ARROW_PYTHON_EXPORT -Status DecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type, Decimal128* out); - -// \brief Convert a Python decimal to an Arrow Decimal256 object -// \param[in] python_decimal A Python decimal.Decimal instance -// \param[in] arrow_type An instance of arrow::DecimalType -// \param[out] out A pointer to a Decimal256 -// \return The status of the operation -ARROW_PYTHON_EXPORT -Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type, - Decimal256* out); - -// \brief Convert a Python object to an Arrow Decimal256 object -// \param[in] python_decimal A Python int or decimal.Decimal instance -// \param[in] arrow_type An instance of arrow::DecimalType -// \param[out] out A pointer to a Decimal256 -// \return The status of the operation -ARROW_PYTHON_EXPORT -Status DecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type, Decimal256* out); - -// \brief Check whether obj is an instance of Decimal -ARROW_PYTHON_EXPORT -bool PyDecimal_Check(PyObject* obj); - -// \brief Check whether obj is nan. 
This function will abort the program if the argument -// is not a Decimal instance -ARROW_PYTHON_EXPORT -bool PyDecimal_ISNAN(PyObject* obj); - -// \brief Helper class to track and update the precision and scale of a decimal -class ARROW_PYTHON_EXPORT DecimalMetadata { - public: - DecimalMetadata(); - DecimalMetadata(int32_t precision, int32_t scale); - - // \brief Adjust the precision and scale of a decimal type given a new precision and a - // new scale \param[in] suggested_precision A candidate precision \param[in] - // suggested_scale A candidate scale \return The status of the operation - Status Update(int32_t suggested_precision, int32_t suggested_scale); - - // \brief A convenient interface for updating the precision and scale based on a Python - // Decimal object \param object A Python Decimal object \return The status of the - // operation - Status Update(PyObject* object); - - int32_t precision() const { return precision_; } - int32_t scale() const { return scale_; } - - private: - int32_t precision_; - int32_t scale_; -}; - -} // namespace internal -} // namespace py -} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/deserialize.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/deserialize.cc deleted file mode 100644 index 961a168..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/deserialize.cc +++ /dev/null @@ -1,495 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
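[Editor's note: for orientation, a sketch of the intended call pattern for the converters declared above. The helper name and the (38, 10) decimal type are illustrative; the GIL is assumed to be held.]

    #include "arrow/python/decimal.h"
    #include "arrow/util/checked_cast.h"

    // Hypothetical helper: accepts a Python int or decimal.Decimal.
    arrow::Status ToDecimal128(PyObject* obj, arrow::Decimal128* out) {
      std::shared_ptr<arrow::DataType> type =
          arrow::decimal128(/*precision=*/38, /*scale=*/10);
      const auto& decimal_type =
          arrow::internal::checked_cast<const arrow::DecimalType&>(*type);
      // Rescales to scale 10 and fails if 38 digits of precision are exceeded.
      return arrow::py::internal::DecimalFromPyObject(obj, decimal_type, out);
    }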
- -#include "arrow/python/deserialize.h" - -#include "arrow/python/numpy_interop.h" - -#include -#include -#include -#include -#include - -#include -#include - -#include "arrow/array.h" -#include "arrow/io/interfaces.h" -#include "arrow/io/memory.h" -#include "arrow/ipc/options.h" -#include "arrow/ipc/reader.h" -#include "arrow/ipc/util.h" -#include "arrow/ipc/writer.h" -#include "arrow/table.h" -#include "arrow/util/checked_cast.h" -#include "arrow/util/logging.h" -#include "arrow/util/value_parsing.h" - -#include "arrow/python/common.h" -#include "arrow/python/datetime.h" -#include "arrow/python/helpers.h" -#include "arrow/python/numpy_convert.h" -#include "arrow/python/pyarrow.h" -#include "arrow/python/serialize.h" - -namespace arrow { - -using internal::checked_cast; -using internal::ParseValue; - -namespace py { - -Status CallDeserializeCallback(PyObject* context, PyObject* value, - PyObject** deserialized_object); - -Status DeserializeTuple(PyObject* context, const Array& array, int64_t start_idx, - int64_t stop_idx, PyObject* base, const SerializedPyObject& blobs, - PyObject** out); - -Status DeserializeList(PyObject* context, const Array& array, int64_t start_idx, - int64_t stop_idx, PyObject* base, const SerializedPyObject& blobs, - PyObject** out); - -Status DeserializeSet(PyObject* context, const Array& array, int64_t start_idx, - int64_t stop_idx, PyObject* base, const SerializedPyObject& blobs, - PyObject** out); - -Status DeserializeDict(PyObject* context, const Array& array, int64_t start_idx, - int64_t stop_idx, PyObject* base, const SerializedPyObject& blobs, - PyObject** out) { - const auto& data = checked_cast(array); - OwnedRef keys, vals; - OwnedRef result(PyDict_New()); - RETURN_IF_PYERROR(); - - DCHECK_EQ(2, data.num_fields()); - - RETURN_NOT_OK(DeserializeList(context, *data.field(0), start_idx, stop_idx, base, blobs, - keys.ref())); - RETURN_NOT_OK(DeserializeList(context, *data.field(1), start_idx, stop_idx, base, blobs, - vals.ref())); - for (int64_t i = start_idx; i < stop_idx; ++i) { - // PyDict_SetItem behaves differently from PyList_SetItem and PyTuple_SetItem. - // The latter two steal references whereas PyDict_SetItem does not. So we need - // to make sure the reference count is decremented by letting the OwnedRef - // go out of scope at the end. 
- int ret = PyDict_SetItem(result.obj(), PyList_GET_ITEM(keys.obj(), i - start_idx), - PyList_GET_ITEM(vals.obj(), i - start_idx)); - if (ret != 0) { - return ConvertPyError(); - } - } - static PyObject* py_type = PyUnicode_FromString("_pytype_"); - if (PyDict_Contains(result.obj(), py_type)) { - RETURN_NOT_OK(CallDeserializeCallback(context, result.obj(), out)); - } else { - *out = result.detach(); - } - return Status::OK(); -} - -Status DeserializeArray(int32_t index, PyObject* base, const SerializedPyObject& blobs, - PyObject** out) { - RETURN_NOT_OK(py::TensorToNdarray(blobs.ndarrays[index], base, out)); - // Mark the array as immutable - OwnedRef flags(PyObject_GetAttrString(*out, "flags")); - if (flags.obj() == NULL) { - return ConvertPyError(); - } - if (PyObject_SetAttrString(flags.obj(), "writeable", Py_False) < 0) { - return ConvertPyError(); - } - return Status::OK(); -} - -Status GetValue(PyObject* context, const Array& arr, int64_t index, int8_t type, - PyObject* base, const SerializedPyObject& blobs, PyObject** result) { - switch (type) { - case PythonType::NONE: - Py_INCREF(Py_None); - *result = Py_None; - return Status::OK(); - case PythonType::BOOL: - *result = PyBool_FromLong(checked_cast(arr).Value(index)); - return Status::OK(); - case PythonType::PY2INT: - case PythonType::INT: { - *result = PyLong_FromSsize_t(checked_cast(arr).Value(index)); - return Status::OK(); - } - case PythonType::BYTES: { - auto view = checked_cast(arr).GetView(index); - *result = PyBytes_FromStringAndSize(view.data(), view.length()); - return CheckPyError(); - } - case PythonType::STRING: { - auto view = checked_cast(arr).GetView(index); - *result = PyUnicode_FromStringAndSize(view.data(), view.length()); - return CheckPyError(); - } - case PythonType::HALF_FLOAT: { - *result = PyHalf_FromHalf(checked_cast(arr).Value(index)); - RETURN_IF_PYERROR(); - return Status::OK(); - } - case PythonType::FLOAT: - *result = PyFloat_FromDouble(checked_cast(arr).Value(index)); - return Status::OK(); - case PythonType::DOUBLE: - *result = PyFloat_FromDouble(checked_cast(arr).Value(index)); - return Status::OK(); - case PythonType::DATE64: { - RETURN_NOT_OK(internal::PyDateTime_from_int( - checked_cast(arr).Value(index), TimeUnit::MICRO, result)); - RETURN_IF_PYERROR(); - return Status::OK(); - } - case PythonType::LIST: { - const auto& l = checked_cast(arr); - return DeserializeList(context, *l.values(), l.value_offset(index), - l.value_offset(index + 1), base, blobs, result); - } - case PythonType::DICT: { - const auto& l = checked_cast(arr); - return DeserializeDict(context, *l.values(), l.value_offset(index), - l.value_offset(index + 1), base, blobs, result); - } - case PythonType::TUPLE: { - const auto& l = checked_cast(arr); - return DeserializeTuple(context, *l.values(), l.value_offset(index), - l.value_offset(index + 1), base, blobs, result); - } - case PythonType::SET: { - const auto& l = checked_cast(arr); - return DeserializeSet(context, *l.values(), l.value_offset(index), - l.value_offset(index + 1), base, blobs, result); - } - case PythonType::TENSOR: { - int32_t ref = checked_cast(arr).Value(index); - *result = wrap_tensor(blobs.tensors[ref]); - return Status::OK(); - } - case PythonType::SPARSECOOTENSOR: { - int32_t ref = checked_cast(arr).Value(index); - const std::shared_ptr& sparse_coo_tensor = - arrow::internal::checked_pointer_cast( - blobs.sparse_tensors[ref]); - *result = wrap_sparse_coo_tensor(sparse_coo_tensor); - return Status::OK(); - } - case PythonType::SPARSECSRMATRIX: { - int32_t 
ref = checked_cast(arr).Value(index); - const std::shared_ptr& sparse_csr_matrix = - arrow::internal::checked_pointer_cast( - blobs.sparse_tensors[ref]); - *result = wrap_sparse_csr_matrix(sparse_csr_matrix); - return Status::OK(); - } - case PythonType::SPARSECSCMATRIX: { - int32_t ref = checked_cast(arr).Value(index); - const std::shared_ptr& sparse_csc_matrix = - arrow::internal::checked_pointer_cast( - blobs.sparse_tensors[ref]); - *result = wrap_sparse_csc_matrix(sparse_csc_matrix); - return Status::OK(); - } - case PythonType::SPARSECSFTENSOR: { - int32_t ref = checked_cast(arr).Value(index); - const std::shared_ptr& sparse_csf_tensor = - arrow::internal::checked_pointer_cast( - blobs.sparse_tensors[ref]); - *result = wrap_sparse_csf_tensor(sparse_csf_tensor); - return Status::OK(); - } - case PythonType::NDARRAY: { - int32_t ref = checked_cast(arr).Value(index); - return DeserializeArray(ref, base, blobs, result); - } - case PythonType::BUFFER: { - int32_t ref = checked_cast(arr).Value(index); - *result = wrap_buffer(blobs.buffers[ref]); - return Status::OK(); - } - default: { - ARROW_CHECK(false) << "union tag " << type << "' not recognized"; - } - } - return Status::OK(); -} - -Status GetPythonTypes(const UnionArray& data, std::vector* result) { - ARROW_CHECK(result != nullptr); - auto type = data.type(); - for (int i = 0; i < type->num_fields(); ++i) { - int8_t tag = 0; - const std::string& data = type->field(i)->name(); - if (!ParseValue(data.c_str(), data.size(), &tag)) { - return Status::SerializationError("Cannot convert string: \"", - type->field(i)->name(), "\" to int8_t"); - } - result->push_back(tag); - } - return Status::OK(); -} - -template -Status DeserializeSequence(PyObject* context, const Array& array, int64_t start_idx, - int64_t stop_idx, PyObject* base, - const SerializedPyObject& blobs, - CreateSequenceFn&& create_sequence, SetItemFn&& set_item, - PyObject** out) { - const auto& data = checked_cast(array); - OwnedRef result(create_sequence(stop_idx - start_idx)); - RETURN_IF_PYERROR(); - const int8_t* type_codes = data.raw_type_codes(); - const int32_t* value_offsets = data.raw_value_offsets(); - std::vector python_types; - RETURN_NOT_OK(GetPythonTypes(data, &python_types)); - for (int64_t i = start_idx; i < stop_idx; ++i) { - const int64_t offset = value_offsets[i]; - const uint8_t type = type_codes[i]; - PyObject* value; - RETURN_NOT_OK(GetValue(context, *data.field(type), offset, python_types[type], base, - blobs, &value)); - RETURN_NOT_OK(set_item(result.obj(), i - start_idx, value)); - } - *out = result.detach(); - return Status::OK(); -} - -Status DeserializeList(PyObject* context, const Array& array, int64_t start_idx, - int64_t stop_idx, PyObject* base, const SerializedPyObject& blobs, - PyObject** out) { - return DeserializeSequence( - context, array, start_idx, stop_idx, base, blobs, - [](int64_t size) { return PyList_New(size); }, - [](PyObject* seq, int64_t index, PyObject* item) { - PyList_SET_ITEM(seq, index, item); - return Status::OK(); - }, - out); -} - -Status DeserializeTuple(PyObject* context, const Array& array, int64_t start_idx, - int64_t stop_idx, PyObject* base, const SerializedPyObject& blobs, - PyObject** out) { - return DeserializeSequence( - context, array, start_idx, stop_idx, base, blobs, - [](int64_t size) { return PyTuple_New(size); }, - [](PyObject* seq, int64_t index, PyObject* item) { - PyTuple_SET_ITEM(seq, index, item); - return Status::OK(); - }, - out); -} - -Status DeserializeSet(PyObject* context, const Array& array, 
int64_t start_idx, - int64_t stop_idx, PyObject* base, const SerializedPyObject& blobs, - PyObject** out) { - return DeserializeSequence( - context, array, start_idx, stop_idx, base, blobs, - [](int64_t size) { return PySet_New(nullptr); }, - [](PyObject* seq, int64_t index, PyObject* item) { - int err = PySet_Add(seq, item); - Py_DECREF(item); - if (err < 0) { - RETURN_IF_PYERROR(); - } - return Status::OK(); - }, - out); -} - -Status ReadSerializedObject(io::RandomAccessFile* src, SerializedPyObject* out) { - int32_t num_tensors; - int32_t num_sparse_tensors; - int32_t num_ndarrays; - int32_t num_buffers; - - // Read number of tensors - RETURN_NOT_OK(src->Read(sizeof(int32_t), reinterpret_cast(&num_tensors))); - RETURN_NOT_OK( - src->Read(sizeof(int32_t), reinterpret_cast(&num_sparse_tensors))); - RETURN_NOT_OK(src->Read(sizeof(int32_t), reinterpret_cast(&num_ndarrays))); - RETURN_NOT_OK(src->Read(sizeof(int32_t), reinterpret_cast(&num_buffers))); - - // Align stream to 8-byte offset - RETURN_NOT_OK(ipc::AlignStream(src, ipc::kArrowIpcAlignment)); - std::shared_ptr reader; - ARROW_ASSIGN_OR_RAISE(reader, ipc::RecordBatchStreamReader::Open(src)); - RETURN_NOT_OK(reader->ReadNext(&out->batch)); - - /// Skip EOS marker - RETURN_NOT_OK(src->Advance(4)); - - /// Align stream so tensor bodies are 64-byte aligned - RETURN_NOT_OK(ipc::AlignStream(src, ipc::kTensorAlignment)); - - for (int i = 0; i < num_tensors; ++i) { - std::shared_ptr tensor; - ARROW_ASSIGN_OR_RAISE(tensor, ipc::ReadTensor(src)); - RETURN_NOT_OK(ipc::AlignStream(src, ipc::kTensorAlignment)); - out->tensors.push_back(tensor); - } - - for (int i = 0; i < num_sparse_tensors; ++i) { - std::shared_ptr sparse_tensor; - ARROW_ASSIGN_OR_RAISE(sparse_tensor, ipc::ReadSparseTensor(src)); - RETURN_NOT_OK(ipc::AlignStream(src, ipc::kTensorAlignment)); - out->sparse_tensors.push_back(sparse_tensor); - } - - for (int i = 0; i < num_ndarrays; ++i) { - std::shared_ptr ndarray; - ARROW_ASSIGN_OR_RAISE(ndarray, ipc::ReadTensor(src)); - RETURN_NOT_OK(ipc::AlignStream(src, ipc::kTensorAlignment)); - out->ndarrays.push_back(ndarray); - } - - ARROW_ASSIGN_OR_RAISE(int64_t offset, src->Tell()); - for (int i = 0; i < num_buffers; ++i) { - int64_t size; - RETURN_NOT_OK(src->ReadAt(offset, sizeof(int64_t), &size)); - offset += sizeof(int64_t); - ARROW_ASSIGN_OR_RAISE(auto buffer, src->ReadAt(offset, size)); - out->buffers.push_back(buffer); - offset += size; - } - - return Status::OK(); -} - -Status DeserializeObject(PyObject* context, const SerializedPyObject& obj, PyObject* base, - PyObject** out) { - PyAcquireGIL lock; - return DeserializeList(context, *obj.batch->column(0), 0, obj.batch->num_rows(), base, - obj, out); -} - -Status GetSerializedFromComponents(int num_tensors, - const SparseTensorCounts& num_sparse_tensors, - int num_ndarrays, int num_buffers, PyObject* data, - SerializedPyObject* out) { - PyAcquireGIL gil; - const Py_ssize_t data_length = PyList_Size(data); - RETURN_IF_PYERROR(); - - const Py_ssize_t expected_data_length = 1 + num_tensors * 2 + - num_sparse_tensors.num_total_buffers() + - num_ndarrays * 2 + num_buffers; - if (data_length != expected_data_length) { - return Status::Invalid("Invalid number of buffers in data"); - } - - auto GetBuffer = [&data](Py_ssize_t index, std::shared_ptr* out) { - ARROW_CHECK_LE(index, PyList_Size(data)); - PyObject* py_buf = PyList_GET_ITEM(data, index); - return unwrap_buffer(py_buf).Value(out); - }; - - Py_ssize_t buffer_index = 0; - - // Read the union batch describing object structure - { - 
std::shared_ptr<Buffer> data_buffer;
-    RETURN_NOT_OK(GetBuffer(buffer_index++, &data_buffer));
-    gil.release();
-    io::BufferReader buf_reader(data_buffer);
-    std::shared_ptr<RecordBatchStreamReader> reader;
-    ARROW_ASSIGN_OR_RAISE(reader, ipc::RecordBatchStreamReader::Open(&buf_reader));
-    RETURN_NOT_OK(reader->ReadNext(&out->batch));
-    gil.acquire();
-  }
-
-  // Zero-copy reconstruct tensors
-  for (int i = 0; i < num_tensors; ++i) {
-    std::shared_ptr<Buffer> metadata;
-    std::shared_ptr<Buffer> body;
-    std::shared_ptr<Tensor> tensor;
-    RETURN_NOT_OK(GetBuffer(buffer_index++, &metadata));
-    RETURN_NOT_OK(GetBuffer(buffer_index++, &body));
-
-    ipc::Message message(metadata, body);
-
-    ARROW_ASSIGN_OR_RAISE(tensor, ipc::ReadTensor(message));
-    out->tensors.emplace_back(std::move(tensor));
-  }
-
-  // Zero-copy reconstruct sparse tensors
-  for (int i = 0, n = num_sparse_tensors.num_total_tensors(); i < n; ++i) {
-    ipc::IpcPayload payload;
-    RETURN_NOT_OK(GetBuffer(buffer_index++, &payload.metadata));
-
-    ARROW_ASSIGN_OR_RAISE(
-        size_t num_bodies,
-        ipc::internal::ReadSparseTensorBodyBufferCount(*payload.metadata));
-
-    payload.body_buffers.reserve(num_bodies);
-    for (size_t i = 0; i < num_bodies; ++i) {
-      std::shared_ptr<Buffer> body;
-      RETURN_NOT_OK(GetBuffer(buffer_index++, &body));
-      payload.body_buffers.emplace_back(body);
-    }
-
-    std::shared_ptr<SparseTensor> sparse_tensor;
-    ARROW_ASSIGN_OR_RAISE(sparse_tensor, ipc::internal::ReadSparseTensorPayload(payload));
-    out->sparse_tensors.emplace_back(std::move(sparse_tensor));
-  }
-
-  // Zero-copy reconstruct tensors for numpy ndarrays
-  for (int i = 0; i < num_ndarrays; ++i) {
-    std::shared_ptr<Buffer> metadata;
-    std::shared_ptr<Buffer> body;
-    std::shared_ptr<Tensor> tensor;
-    RETURN_NOT_OK(GetBuffer(buffer_index++, &metadata));
-    RETURN_NOT_OK(GetBuffer(buffer_index++, &body));
-
-    ipc::Message message(metadata, body);
-
-    ARROW_ASSIGN_OR_RAISE(tensor, ipc::ReadTensor(message));
-    out->ndarrays.emplace_back(std::move(tensor));
-  }
-
-  // Unwrap and append buffers
-  for (int i = 0; i < num_buffers; ++i) {
-    std::shared_ptr<Buffer> buffer;
-    RETURN_NOT_OK(GetBuffer(buffer_index++, &buffer));
-    out->buffers.emplace_back(std::move(buffer));
-  }
-
-  return Status::OK();
-}
-
-Status DeserializeNdarray(const SerializedPyObject& object,
-                          std::shared_ptr<Tensor>* out) {
-  if (object.ndarrays.size() != 1) {
-    return Status::Invalid("Object is not an Ndarray");
-  }
-  *out = object.ndarrays[0];
-  return Status::OK();
-}
-
-Status NdarrayFromBuffer(std::shared_ptr<Buffer> src, std::shared_ptr<Tensor>* out) {
-  io::BufferReader reader(src);
-  SerializedPyObject object;
-  RETURN_NOT_OK(ReadSerializedObject(&reader, &object));
-  return DeserializeNdarray(object, out);
-}
-
-}  // namespace py
-}  // namespace arrow
diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/deserialize.h b/src/vendored/apache-arrow-12.0.1/arrow/python/deserialize.h
deleted file mode 100644
index 41b6a13..0000000
--- a/src/vendored/apache-arrow-12.0.1/arrow/python/deserialize.h
+++ /dev/null
@@ -1,106 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include -#include - -#include "arrow/python/serialize.h" -#include "arrow/python/visibility.h" -#include "arrow/status.h" - -namespace arrow { - -class RecordBatch; -class Tensor; - -namespace io { - -class RandomAccessFile; - -} // namespace io - -namespace py { - -struct ARROW_PYTHON_EXPORT SparseTensorCounts { - int coo; - int csr; - int csc; - int csf; - int ndim_csf; - - int num_total_tensors() const { return coo + csr + csc + csf; } - int num_total_buffers() const { - return coo * 3 + csr * 4 + csc * 4 + 2 * ndim_csf + csf; - } -}; - -/// \brief Read serialized Python sequence from file interface using Arrow IPC -/// \param[in] src a RandomAccessFile -/// \param[out] out the reconstructed data -/// \return Status -ARROW_PYTHON_EXPORT -Status ReadSerializedObject(io::RandomAccessFile* src, SerializedPyObject* out); - -/// \brief Reconstruct SerializedPyObject from representation produced by -/// SerializedPyObject::GetComponents. -/// -/// \param[in] num_tensors number of tensors in the object -/// \param[in] num_sparse_tensors number of sparse tensors in the object -/// \param[in] num_ndarrays number of numpy Ndarrays in the object -/// \param[in] num_buffers number of buffers in the object -/// \param[in] data a list containing pyarrow.Buffer instances. It must be 1 + -/// num_tensors * 2 + num_coo_tensors * 3 + num_csr_tensors * 4 + num_csc_tensors * 4 + -/// num_csf_tensors * (2 * ndim_csf + 3) + num_buffers in length -/// \param[out] out the reconstructed object -/// \return Status -ARROW_PYTHON_EXPORT -Status GetSerializedFromComponents(int num_tensors, - const SparseTensorCounts& num_sparse_tensors, - int num_ndarrays, int num_buffers, PyObject* data, - SerializedPyObject* out); - -/// \brief Reconstruct Python object from Arrow-serialized representation -/// \param[in] context Serialization context which contains custom serialization -/// and deserialization callbacks. Can be any Python object with a -/// _serialize_callback method for serialization and a _deserialize_callback -/// method for deserialization. If context is None, no custom serialization -/// will be attempted. 
-/// \param[in] object Object to deserialize -/// \param[in] base a Python object holding the underlying data that any NumPy -/// arrays will reference, to avoid premature deallocation -/// \param[out] out The returned object -/// \return Status -/// This acquires the GIL -ARROW_PYTHON_EXPORT -Status DeserializeObject(PyObject* context, const SerializedPyObject& object, - PyObject* base, PyObject** out); - -/// \brief Reconstruct Ndarray from Arrow-serialized representation -/// \param[in] object Object to deserialize -/// \param[out] out The deserialized tensor -/// \return Status -ARROW_PYTHON_EXPORT -Status DeserializeNdarray(const SerializedPyObject& object, std::shared_ptr* out); - -ARROW_PYTHON_EXPORT -Status NdarrayFromBuffer(std::shared_ptr src, std::shared_ptr* out); - -} // namespace py -} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/extension_type.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/extension_type.cc deleted file mode 100644 index 3ccc171..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/extension_type.cc +++ /dev/null @@ -1,217 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
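[Editor's note: before the extension-type shims below, it is worth making the component layout that GetSerializedFromComponents() expects concrete. The arithmetic mirrors SparseTensorCounts::num_total_buffers() and the expected_data_length check above; the payload shape is illustrative.]

    // Sketch: 1 dense tensor, 1 COO sparse tensor, 1 ndarray, 2 plain buffers.
    arrow::py::SparseTensorCounts counts{/*coo=*/1, /*csr=*/0, /*csc=*/0,
                                         /*csf=*/0, /*ndim_csf=*/0};
    // counts.num_total_buffers() = 1*3 + 0*4 + 0*4 + 2*0 + 0 = 3
    const int expected_data_length = 1        // union batch with the object structure
                                   + 1 * 2    // metadata + body per dense tensor
                                   + counts.num_total_buffers()
                                   + 1 * 2    // metadata + body per ndarray
                                   + 2;       // one entry per plain buffer
    // expected_data_length == 10: the Python `data` list passed in must contain
    // exactly ten pyarrow.Buffer instances, in this order.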
- -#include -#include -#include - -#include "arrow/python/extension_type.h" -#include "arrow/python/helpers.h" -#include "arrow/python/pyarrow.h" -#include "arrow/util/checked_cast.h" -#include "arrow/util/logging.h" - -namespace arrow { - -using internal::checked_cast; - -namespace py { - -namespace { - -// Serialize a Python ExtensionType instance -Status SerializeExtInstance(PyObject* type_instance, std::string* out) { - OwnedRef res( - cpp_PyObject_CallMethod(type_instance, "__arrow_ext_serialize__", nullptr)); - if (!res) { - return ConvertPyError(); - } - if (!PyBytes_Check(res.obj())) { - return Status::TypeError( - "__arrow_ext_serialize__ should return bytes object, " - "got ", - internal::PyObject_StdStringRepr(res.obj())); - } - *out = internal::PyBytes_AsStdString(res.obj()); - return Status::OK(); -} - -// Deserialize a Python ExtensionType instance -PyObject* DeserializeExtInstance(PyObject* type_class, - std::shared_ptr storage_type, - const std::string& serialized_data) { - OwnedRef storage_ref(wrap_data_type(storage_type)); - if (!storage_ref) { - return nullptr; - } - OwnedRef data_ref(PyBytes_FromStringAndSize( - serialized_data.data(), static_cast(serialized_data.size()))); - if (!data_ref) { - return nullptr; - } - - return cpp_PyObject_CallMethod(type_class, "__arrow_ext_deserialize__", "OO", - storage_ref.obj(), data_ref.obj()); -} - -} // namespace - -static const char* kExtensionName = "arrow.py_extension_type"; - -std::string PyExtensionType::ToString() const { - PyAcquireGIL lock; - - std::stringstream ss; - OwnedRef instance(GetInstance()); - ss << "extension<" << this->extension_name() << "<" << Py_TYPE(instance.obj())->tp_name - << ">>"; - return ss.str(); -} - -PyExtensionType::PyExtensionType(std::shared_ptr storage_type, PyObject* typ, - PyObject* inst) - : ExtensionType(storage_type), - extension_name_(kExtensionName), - type_class_(typ), - type_instance_(inst) {} - -PyExtensionType::PyExtensionType(std::shared_ptr storage_type, - std::string extension_name, PyObject* typ, - PyObject* inst) - : ExtensionType(storage_type), - extension_name_(std::move(extension_name)), - type_class_(typ), - type_instance_(inst) {} - -bool PyExtensionType::ExtensionEquals(const ExtensionType& other) const { - PyAcquireGIL lock; - - if (other.extension_name() != extension_name()) { - return false; - } - const auto& other_ext = checked_cast(other); - int res = -1; - if (!type_instance_) { - if (other_ext.type_instance_) { - return false; - } - // Compare Python types - res = PyObject_RichCompareBool(type_class_.obj(), other_ext.type_class_.obj(), Py_EQ); - } else { - if (!other_ext.type_instance_) { - return false; - } - // Compare Python instances - OwnedRef left(GetInstance()); - OwnedRef right(other_ext.GetInstance()); - if (!left || !right) { - goto error; - } - res = PyObject_RichCompareBool(left.obj(), right.obj(), Py_EQ); - } - if (res == -1) { - goto error; - } - return res == 1; - -error: - // Cannot propagate error - PyErr_WriteUnraisable(nullptr); - return false; -} - -std::shared_ptr PyExtensionType::MakeArray(std::shared_ptr data) const { - DCHECK_EQ(data->type->id(), Type::EXTENSION); - return std::make_shared(data); -} - -std::string PyExtensionType::Serialize() const { - DCHECK(type_instance_); - return serialized_; -} - -Result> PyExtensionType::Deserialize( - std::shared_ptr storage_type, const std::string& serialized_data) const { - PyAcquireGIL lock; - - if (import_pyarrow()) { - return ConvertPyError(); - } - OwnedRef 
res(DeserializeExtInstance(type_class_.obj(), storage_type, serialized_data)); - if (!res) { - return ConvertPyError(); - } - return unwrap_data_type(res.obj()); -} - -PyObject* PyExtensionType::GetInstance() const { - if (!type_instance_) { - PyErr_SetString(PyExc_TypeError, "Not an instance"); - return nullptr; - } - DCHECK(PyWeakref_CheckRef(type_instance_.obj())); - PyObject* inst = PyWeakref_GET_OBJECT(type_instance_.obj()); - if (inst != Py_None) { - // Cached instance still alive - Py_INCREF(inst); - return inst; - } else { - // Must reconstruct from serialized form - // XXX cache again? - return DeserializeExtInstance(type_class_.obj(), storage_type_, serialized_); - } -} - -Status PyExtensionType::SetInstance(PyObject* inst) const { - // Check we have the right type - PyObject* typ = reinterpret_cast(Py_TYPE(inst)); - if (typ != type_class_.obj()) { - return Status::TypeError("Unexpected Python ExtensionType class ", - internal::PyObject_StdStringRepr(typ), " expected ", - internal::PyObject_StdStringRepr(type_class_.obj())); - } - - PyObject* wr = PyWeakref_NewRef(inst, nullptr); - if (wr == NULL) { - return ConvertPyError(); - } - type_instance_.reset(wr); - return SerializeExtInstance(inst, &serialized_); -} - -Status PyExtensionType::FromClass(const std::shared_ptr storage_type, - const std::string extension_name, PyObject* typ, - std::shared_ptr* out) { - Py_INCREF(typ); - out->reset(new PyExtensionType(storage_type, std::move(extension_name), typ)); - return Status::OK(); -} - -Status RegisterPyExtensionType(const std::shared_ptr& type) { - DCHECK_EQ(type->id(), Type::EXTENSION); - auto ext_type = std::dynamic_pointer_cast(type); - return RegisterExtensionType(ext_type); -} - -Status UnregisterPyExtensionType(const std::string& type_name) { - return UnregisterExtensionType(type_name); -} - -std::string PyExtensionName() { return kExtensionName; } - -} // namespace py -} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/extension_type.h b/src/vendored/apache-arrow-12.0.1/arrow/python/extension_type.h deleted file mode 100644 index e433d9a..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/extension_type.h +++ /dev/null @@ -1,85 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
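[Editor's note: a sketch of how the pieces above fit together when driven from C++ (normally this is done from Cython). `py_class` is a borrowed reference to the Python extension class; the int64 storage type is illustrative.]

    arrow::Status RegisterMyExtension(PyObject* py_class) {
      std::shared_ptr<arrow::py::PyExtensionType> ext_type;
      ARROW_RETURN_NOT_OK(arrow::py::PyExtensionType::FromClass(
          arrow::int64(),              // storage type carried on the wire
          "arrow.py_extension_type",   // extension name used for lookup
          py_class, &ext_type));
      // IPC readers can now revive instances via __arrow_ext_deserialize__.
      return arrow::py::RegisterPyExtensionType(ext_type);
    }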
- -#pragma once - -#include -#include - -#include "arrow/extension_type.h" -#include "arrow/python/common.h" -#include "arrow/python/visibility.h" -#include "arrow/util/macros.h" - -namespace arrow { -namespace py { - -class ARROW_PYTHON_EXPORT PyExtensionType : public ExtensionType { - public: - // Implement extensionType API - std::string extension_name() const override { return extension_name_; } - - std::string ToString() const override; - - bool ExtensionEquals(const ExtensionType& other) const override; - - std::shared_ptr MakeArray(std::shared_ptr data) const override; - - Result> Deserialize( - std::shared_ptr storage_type, - const std::string& serialized) const override; - - std::string Serialize() const override; - - // For use from Cython - // Assumes that `typ` is borrowed - static Status FromClass(const std::shared_ptr storage_type, - const std::string extension_name, PyObject* typ, - std::shared_ptr* out); - - // Return new ref - PyObject* GetInstance() const; - Status SetInstance(PyObject*) const; - - protected: - PyExtensionType(std::shared_ptr storage_type, PyObject* typ, - PyObject* inst = NULLPTR); - PyExtensionType(std::shared_ptr storage_type, std::string extension_name, - PyObject* typ, PyObject* inst = NULLPTR); - - std::string extension_name_; - - // These fields are mutable because of two-step initialization. - mutable OwnedRefNoGIL type_class_; - // A weakref or null. Storing a strong reference to the Python extension type - // instance would create an unreclaimable reference cycle between Python and C++ - // (the Python instance has to keep a strong reference to the C++ ExtensionType - // in other direction). Instead, we store a weakref to the instance. - // If the weakref is dead, we reconstruct the instance from its serialized form. - mutable OwnedRefNoGIL type_instance_; - // Empty if type_instance_ is null - mutable std::string serialized_; -}; - -ARROW_PYTHON_EXPORT std::string PyExtensionName(); - -ARROW_PYTHON_EXPORT Status RegisterPyExtensionType(const std::shared_ptr&); - -ARROW_PYTHON_EXPORT Status UnregisterPyExtensionType(const std::string& type_name); - -} // namespace py -} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/filesystem.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/filesystem.cc deleted file mode 100644 index 5e9b500..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/filesystem.cc +++ /dev/null @@ -1,206 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
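[Editor's note: the weakref scheme documented in the header above amounts to a round trip. A fragment-level sketch, assuming it runs inside a Status-returning function with the GIL held, `ext_type` a std::shared_ptr<arrow::py::PyExtensionType>, and `inst` a live instance of the registered Python class:]

    // SetInstance() stores a weakref to `inst` plus its serialized bytes.
    ARROW_RETURN_NOT_OK(ext_type->SetInstance(inst));
    // GetInstance() returns a new strong reference while `inst` is alive; once
    // the weakref dies it rebuilds the instance from the serialized form via
    // __arrow_ext_deserialize__ (see DeserializeExtInstance above).
    PyObject* revived = ext_type->GetInstance();
    if (revived == nullptr) return arrow::py::ConvertPyError();
    Py_DECREF(revived);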
- -#include "arrow/python/filesystem.h" -#include "arrow/util/logging.h" - -namespace arrow { - -using fs::FileInfo; -using fs::FileSelector; - -namespace py { -namespace fs { - -PyFileSystem::PyFileSystem(PyObject* handler, PyFileSystemVtable vtable) - : handler_(handler), vtable_(std::move(vtable)) { - Py_INCREF(handler); -} - -PyFileSystem::~PyFileSystem() {} - -std::shared_ptr PyFileSystem::Make(PyObject* handler, - PyFileSystemVtable vtable) { - return std::make_shared(handler, std::move(vtable)); -} - -std::string PyFileSystem::type_name() const { - std::string result; - auto st = SafeCallIntoPython([&]() -> Status { - vtable_.get_type_name(handler_.obj(), &result); - if (PyErr_Occurred()) { - PyErr_WriteUnraisable(handler_.obj()); - } - return Status::OK(); - }); - ARROW_UNUSED(st); - return result; -} - -bool PyFileSystem::Equals(const FileSystem& other) const { - bool result; - auto st = SafeCallIntoPython([&]() -> Status { - result = vtable_.equals(handler_.obj(), other); - if (PyErr_Occurred()) { - PyErr_WriteUnraisable(handler_.obj()); - } - return Status::OK(); - }); - ARROW_UNUSED(st); - return result; -} - -Result PyFileSystem::GetFileInfo(const std::string& path) { - FileInfo info; - - auto st = SafeCallIntoPython([&]() -> Status { - vtable_.get_file_info(handler_.obj(), path, &info); - return CheckPyError(); - }); - RETURN_NOT_OK(st); - return info; -} - -Result> PyFileSystem::GetFileInfo( - const std::vector& paths) { - std::vector infos; - - auto st = SafeCallIntoPython([&]() -> Status { - vtable_.get_file_info_vector(handler_.obj(), paths, &infos); - return CheckPyError(); - }); - RETURN_NOT_OK(st); - return infos; -} - -Result> PyFileSystem::GetFileInfo(const FileSelector& select) { - std::vector infos; - - auto st = SafeCallIntoPython([&]() -> Status { - vtable_.get_file_info_selector(handler_.obj(), select, &infos); - return CheckPyError(); - }); - RETURN_NOT_OK(st); - return infos; -} - -Status PyFileSystem::CreateDir(const std::string& path, bool recursive) { - return SafeCallIntoPython([&]() -> Status { - vtable_.create_dir(handler_.obj(), path, recursive); - return CheckPyError(); - }); -} - -Status PyFileSystem::DeleteDir(const std::string& path) { - return SafeCallIntoPython([&]() -> Status { - vtable_.delete_dir(handler_.obj(), path); - return CheckPyError(); - }); -} - -Status PyFileSystem::DeleteDirContents(const std::string& path, bool missing_dir_ok) { - return SafeCallIntoPython([&]() -> Status { - vtable_.delete_dir_contents(handler_.obj(), path, missing_dir_ok); - return CheckPyError(); - }); -} - -Status PyFileSystem::DeleteRootDirContents() { - return SafeCallIntoPython([&]() -> Status { - vtable_.delete_root_dir_contents(handler_.obj()); - return CheckPyError(); - }); -} - -Status PyFileSystem::DeleteFile(const std::string& path) { - return SafeCallIntoPython([&]() -> Status { - vtable_.delete_file(handler_.obj(), path); - return CheckPyError(); - }); -} - -Status PyFileSystem::Move(const std::string& src, const std::string& dest) { - return SafeCallIntoPython([&]() -> Status { - vtable_.move(handler_.obj(), src, dest); - return CheckPyError(); - }); -} - -Status PyFileSystem::CopyFile(const std::string& src, const std::string& dest) { - return SafeCallIntoPython([&]() -> Status { - vtable_.copy_file(handler_.obj(), src, dest); - return CheckPyError(); - }); -} - -Result> PyFileSystem::OpenInputStream( - const std::string& path) { - std::shared_ptr stream; - auto st = SafeCallIntoPython([&]() -> Status { - vtable_.open_input_stream(handler_.obj(), 
path, &stream); - return CheckPyError(); - }); - RETURN_NOT_OK(st); - return stream; -} - -Result> PyFileSystem::OpenInputFile( - const std::string& path) { - std::shared_ptr stream; - auto st = SafeCallIntoPython([&]() -> Status { - vtable_.open_input_file(handler_.obj(), path, &stream); - return CheckPyError(); - }); - RETURN_NOT_OK(st); - return stream; -} - -Result> PyFileSystem::OpenOutputStream( - const std::string& path, const std::shared_ptr& metadata) { - std::shared_ptr stream; - auto st = SafeCallIntoPython([&]() -> Status { - vtable_.open_output_stream(handler_.obj(), path, metadata, &stream); - return CheckPyError(); - }); - RETURN_NOT_OK(st); - return stream; -} - -Result> PyFileSystem::OpenAppendStream( - const std::string& path, const std::shared_ptr& metadata) { - std::shared_ptr stream; - auto st = SafeCallIntoPython([&]() -> Status { - vtable_.open_append_stream(handler_.obj(), path, metadata, &stream); - return CheckPyError(); - }); - RETURN_NOT_OK(st); - return stream; -} - -Result PyFileSystem::NormalizePath(std::string path) { - std::string normalized; - auto st = SafeCallIntoPython([&]() -> Status { - vtable_.normalize_path(handler_.obj(), path, &normalized); - return CheckPyError(); - }); - RETURN_NOT_OK(st); - return normalized; -} - -} // namespace fs -} // namespace py -} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/filesystem.h b/src/vendored/apache-arrow-12.0.1/arrow/python/filesystem.h deleted file mode 100644 index 003fd5c..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/filesystem.h +++ /dev/null @@ -1,126 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
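[Editor's note: to show the trampoline shape declared in the header below, a minimal sketch that fills just one vtable slot. The `handler` object and the returned name are placeholders; real callers populate every slot from Cython.]

    arrow::py::fs::PyFileSystemVtable vtable;
    vtable.get_type_name = [](PyObject* handler, std::string* out) {
      // A real implementation would call a method on `handler`; any Python
      // error is left in the error indicator, which PyFileSystem::type_name()
      // above reports via PyErr_WriteUnraisable().
      *out = "pyfs";  // placeholder value
    };
    std::shared_ptr<arrow::py::fs::PyFileSystem> fs =
        arrow::py::fs::PyFileSystem::Make(handler, std::move(vtable));
    // Each FileSystem override now runs its slot inside SafeCallIntoPython(),
    // converting Python exceptions into arrow::Status via CheckPyError().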
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "arrow/filesystem/filesystem.h"
-#include "arrow/python/common.h"
-#include "arrow/python/visibility.h"
-#include "arrow/util/macros.h"
-
-namespace arrow {
-namespace py {
-namespace fs {
-
-class ARROW_PYTHON_EXPORT PyFileSystemVtable {
- public:
-  std::function<void(PyObject*, std::string* out)> get_type_name;
-  std::function<bool(PyObject*, const arrow::fs::FileSystem& other)> equals;
-
-  std::function<void(PyObject*, const std::string& path, arrow::fs::FileInfo* out)>
-      get_file_info;
-  std::function<void(PyObject*, const std::vector<std::string>& paths,
-                     std::vector<arrow::fs::FileInfo>* out)>
-      get_file_info_vector;
-  std::function<void(PyObject*, const arrow::fs::FileSelector& select,
-                     std::vector<arrow::fs::FileInfo>* out)>
-      get_file_info_selector;
-
-  std::function<void(PyObject*, const std::string& path, bool)> create_dir;
-  std::function<void(PyObject*, const std::string& path)> delete_dir;
-  std::function<void(PyObject*, const std::string& path, bool)> delete_dir_contents;
-  std::function<void(PyObject*)> delete_root_dir_contents;
-  std::function<void(PyObject*, const std::string& path)> delete_file;
-  std::function<void(PyObject*, const std::string& src, const std::string& dest)> move;
-  std::function<void(PyObject*, const std::string& src, const std::string& dest)>
-      copy_file;
-
-  std::function<void(PyObject*, const std::string& path,
-                     std::shared_ptr<io::InputStream>* out)>
-      open_input_stream;
-  std::function<void(PyObject*, const std::string& path,
-                     std::shared_ptr<io::RandomAccessFile>* out)>
-      open_input_file;
-  std::function<void(PyObject*, const std::string& path,
-                     const std::shared_ptr<const KeyValueMetadata>&,
-                     std::shared_ptr<io::OutputStream>* out)>
-      open_output_stream;
-  std::function<void(PyObject*, const std::string& path,
-                     const std::shared_ptr<const KeyValueMetadata>&,
-                     std::shared_ptr<io::OutputStream>* out)>
-      open_append_stream;
-
-  std::function<void(PyObject*, const std::string& path, std::string* out)>
-      normalize_path;
-};
-
-class ARROW_PYTHON_EXPORT PyFileSystem : public arrow::fs::FileSystem {
- public:
-  PyFileSystem(PyObject* handler, PyFileSystemVtable vtable);
-  ~PyFileSystem() override;
-
-  static std::shared_ptr<PyFileSystem> Make(PyObject* handler, PyFileSystemVtable vtable);
-
-  std::string type_name() const override;
-
-  bool Equals(const FileSystem& other) const override;
-
-  Result<arrow::fs::FileInfo> GetFileInfo(const std::string& path) override;
-  Result<std::vector<arrow::fs::FileInfo>> GetFileInfo(
-      const std::vector<std::string>& paths) override;
-  Result<std::vector<arrow::fs::FileInfo>> GetFileInfo(
-      const arrow::fs::FileSelector& select) override;
-
-  Status CreateDir(const std::string& path, bool recursive = true) override;
-
-  Status DeleteDir(const std::string& path) override;
-  Status DeleteDirContents(const std::string& path, bool missing_dir_ok = false) override;
-  Status DeleteRootDirContents() override;
-
-  Status DeleteFile(const std::string& path) override;
-
-  Status Move(const std::string& src, const std::string& dest) override;
-
-  Status CopyFile(const std::string& src, const std::string& dest) override;
-
-  Result<std::shared_ptr<io::InputStream>> OpenInputStream(
-      const std::string& path) override;
-  Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
-      const std::string& path) override;
-  Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
-      const std::string& path,
-      const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;
-  Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
-      const std::string& path,
-      const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;
-
-  Result<std::string> NormalizePath(std::string path) override;
-
-  PyObject* handler() const { return handler_.obj(); }
-
- private:
-  OwnedRefNoGIL handler_;
-  PyFileSystemVtable vtable_;
-};
-
-}  // namespace fs
-}  // namespace py
-}  // namespace arrow
diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/flight.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/flight.cc
deleted file mode 100644
index bf7af27..0000000
--- a/src/vendored/apache-arrow-12.0.1/arrow/python/flight.cc
+++ /dev/null
@@ -1,388 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include -#include - -#include "arrow/python/flight.h" -#include "arrow/util/io_util.h" -#include "arrow/util/logging.h" - -using arrow::flight::FlightPayload; - -namespace arrow { -namespace py { -namespace flight { - -const char* kPyServerMiddlewareName = "arrow.py_server_middleware"; - -PyServerAuthHandler::PyServerAuthHandler(PyObject* handler, - const PyServerAuthHandlerVtable& vtable) - : vtable_(vtable) { - Py_INCREF(handler); - handler_.reset(handler); -} - -Status PyServerAuthHandler::Authenticate(arrow::flight::ServerAuthSender* outgoing, - arrow::flight::ServerAuthReader* incoming) { - return SafeCallIntoPython([=] { - const Status status = vtable_.authenticate(handler_.obj(), outgoing, incoming); - RETURN_NOT_OK(CheckPyError()); - return status; - }); -} - -Status PyServerAuthHandler::IsValid(const std::string& token, - std::string* peer_identity) { - return SafeCallIntoPython([=] { - const Status status = vtable_.is_valid(handler_.obj(), token, peer_identity); - RETURN_NOT_OK(CheckPyError()); - return status; - }); -} - -PyClientAuthHandler::PyClientAuthHandler(PyObject* handler, - const PyClientAuthHandlerVtable& vtable) - : vtable_(vtable) { - Py_INCREF(handler); - handler_.reset(handler); -} - -Status PyClientAuthHandler::Authenticate(arrow::flight::ClientAuthSender* outgoing, - arrow::flight::ClientAuthReader* incoming) { - return SafeCallIntoPython([=] { - const Status status = vtable_.authenticate(handler_.obj(), outgoing, incoming); - RETURN_NOT_OK(CheckPyError()); - return status; - }); -} - -Status PyClientAuthHandler::GetToken(std::string* token) { - return SafeCallIntoPython([=] { - const Status status = vtable_.get_token(handler_.obj(), token); - RETURN_NOT_OK(CheckPyError()); - return status; - }); -} - -PyFlightServer::PyFlightServer(PyObject* server, const PyFlightServerVtable& vtable) - : vtable_(vtable) { - Py_INCREF(server); - server_.reset(server); -} - -Status PyFlightServer::ListFlights( - const arrow::flight::ServerCallContext& context, - const arrow::flight::Criteria* criteria, - std::unique_ptr* listings) { - return SafeCallIntoPython([&] { - const Status status = - vtable_.list_flights(server_.obj(), context, criteria, listings); - RETURN_NOT_OK(CheckPyError()); - return status; - }); -} - -Status PyFlightServer::GetFlightInfo(const arrow::flight::ServerCallContext& context, - const arrow::flight::FlightDescriptor& request, - std::unique_ptr* info) { - return SafeCallIntoPython([&] { - const Status status = vtable_.get_flight_info(server_.obj(), context, request, info); - RETURN_NOT_OK(CheckPyError()); - return status; - }); -} - -Status PyFlightServer::GetSchema(const arrow::flight::ServerCallContext& context, - const arrow::flight::FlightDescriptor& request, - std::unique_ptr* result) { - return SafeCallIntoPython([&] { - const Status status = vtable_.get_schema(server_.obj(), context, request, result); - RETURN_NOT_OK(CheckPyError()); - return status; - }); -} - -Status PyFlightServer::DoGet(const arrow::flight::ServerCallContext& context, - const arrow::flight::Ticket& request, - std::unique_ptr* stream) { - return 
SafeCallIntoPython([&] { - const Status status = vtable_.do_get(server_.obj(), context, request, stream); - RETURN_NOT_OK(CheckPyError()); - return status; - }); -} - -Status PyFlightServer::DoPut( - const arrow::flight::ServerCallContext& context, - std::unique_ptr reader, - std::unique_ptr writer) { - return SafeCallIntoPython([&] { - const Status status = - vtable_.do_put(server_.obj(), context, std::move(reader), std::move(writer)); - RETURN_NOT_OK(CheckPyError()); - return status; - }); -} - -Status PyFlightServer::DoExchange( - const arrow::flight::ServerCallContext& context, - std::unique_ptr reader, - std::unique_ptr writer) { - return SafeCallIntoPython([&] { - const Status status = - vtable_.do_exchange(server_.obj(), context, std::move(reader), std::move(writer)); - RETURN_NOT_OK(CheckPyError()); - return status; - }); -} - -Status PyFlightServer::DoAction(const arrow::flight::ServerCallContext& context, - const arrow::flight::Action& action, - std::unique_ptr* result) { - return SafeCallIntoPython([&] { - const Status status = vtable_.do_action(server_.obj(), context, action, result); - RETURN_NOT_OK(CheckPyError()); - return status; - }); -} - -Status PyFlightServer::ListActions(const arrow::flight::ServerCallContext& context, - std::vector* actions) { - return SafeCallIntoPython([&] { - const Status status = vtable_.list_actions(server_.obj(), context, actions); - RETURN_NOT_OK(CheckPyError()); - return status; - }); -} - -Status PyFlightServer::ServeWithSignals() { - // Respect the current Python settings, i.e. only interrupt the server if there is - // an active signal handler for SIGINT and SIGTERM. - std::vector signals; - for (const int signum : {SIGINT, SIGTERM}) { - ARROW_ASSIGN_OR_RAISE(auto handler, ::arrow::internal::GetSignalHandler(signum)); - auto cb = handler.callback(); - if (cb != SIG_DFL && cb != SIG_IGN) { - signals.push_back(signum); - } - } - RETURN_NOT_OK(SetShutdownOnSignals(signals)); - - // Serve until we got told to shutdown or a signal interrupted us - RETURN_NOT_OK(Serve()); - int signum = GotSignal(); - if (signum != 0) { - // Issue the signal again with Python's signal handlers restored - PyAcquireGIL lock; - raise(signum); - // XXX Ideally we would loop and serve again if no exception was raised. - // Unfortunately, gRPC will return immediately if Serve() is called again. 
- ARROW_UNUSED(PyErr_CheckSignals()); - } - - return Status::OK(); -} - -PyFlightResultStream::PyFlightResultStream(PyObject* generator, - PyFlightResultStreamCallback callback) - : callback_(callback) { - Py_INCREF(generator); - generator_.reset(generator); -} - -arrow::Result> PyFlightResultStream::Next() { - return SafeCallIntoPython( - [=]() -> arrow::Result> { - std::unique_ptr result; - const Status status = callback_(generator_.obj(), &result); - RETURN_NOT_OK(CheckPyError()); - RETURN_NOT_OK(status); - return result; - }); -} - -PyFlightDataStream::PyFlightDataStream( - PyObject* data_source, std::unique_ptr stream) - : stream_(std::move(stream)) { - Py_INCREF(data_source); - data_source_.reset(data_source); -} - -std::shared_ptr PyFlightDataStream::schema() { return stream_->schema(); } - -arrow::Result PyFlightDataStream::GetSchemaPayload() { - return stream_->GetSchemaPayload(); -} - -arrow::Result PyFlightDataStream::Next() { return stream_->Next(); } - -PyGeneratorFlightDataStream::PyGeneratorFlightDataStream( - PyObject* generator, std::shared_ptr schema, - PyGeneratorFlightDataStreamCallback callback, const ipc::IpcWriteOptions& options) - : schema_(schema), mapper_(*schema_), options_(options), callback_(callback) { - Py_INCREF(generator); - generator_.reset(generator); -} - -std::shared_ptr PyGeneratorFlightDataStream::schema() { return schema_; } - -arrow::Result PyGeneratorFlightDataStream::GetSchemaPayload() { - FlightPayload payload; - RETURN_NOT_OK(ipc::GetSchemaPayload(*schema_, options_, mapper_, &payload.ipc_message)); - return payload; -} - -arrow::Result PyGeneratorFlightDataStream::Next() { - return SafeCallIntoPython([=]() -> arrow::Result { - FlightPayload payload; - const Status status = callback_(generator_.obj(), &payload); - RETURN_NOT_OK(CheckPyError()); - RETURN_NOT_OK(status); - return payload; - }); -} - -// Flight Server Middleware - -PyServerMiddlewareFactory::PyServerMiddlewareFactory(PyObject* factory, - StartCallCallback start_call) - : start_call_(start_call) { - Py_INCREF(factory); - factory_.reset(factory); -} - -Status PyServerMiddlewareFactory::StartCall( - const arrow::flight::CallInfo& info, - const arrow::flight::CallHeaders& incoming_headers, - std::shared_ptr* middleware) { - return SafeCallIntoPython([&] { - const Status status = start_call_(factory_.obj(), info, incoming_headers, middleware); - RETURN_NOT_OK(CheckPyError()); - return status; - }); -} - -PyServerMiddleware::PyServerMiddleware(PyObject* middleware, Vtable vtable) - : vtable_(vtable) { - Py_INCREF(middleware); - middleware_.reset(middleware); -} - -void PyServerMiddleware::SendingHeaders(arrow::flight::AddCallHeaders* outgoing_headers) { - const Status& status = SafeCallIntoPython([&] { - const Status status = vtable_.sending_headers(middleware_.obj(), outgoing_headers); - RETURN_NOT_OK(CheckPyError()); - return status; - }); - - ARROW_WARN_NOT_OK(status, "Python server middleware failed in SendingHeaders"); -} - -void PyServerMiddleware::CallCompleted(const Status& call_status) { - const Status& status = SafeCallIntoPython([&] { - const Status status = vtable_.call_completed(middleware_.obj(), call_status); - RETURN_NOT_OK(CheckPyError()); - return status; - }); - - ARROW_WARN_NOT_OK(status, "Python server middleware failed in CallCompleted"); -} - -std::string PyServerMiddleware::name() const { return kPyServerMiddlewareName; } - -PyObject* PyServerMiddleware::py_object() const { return middleware_.obj(); } - -// Flight Client Middleware - 
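Aside: every wrapper in this file funnels Python callbacks through the same two-step idiom: run the callback, then convert any pending Python exception into a Status (CheckPyError), with a SafeCallIntoPython helper guaranteeing the GIL is held. Below is a stripped-down, self-contained sketch of the exception-to-error half against the raw CPython API; it is simplified (no traceback capture, no GIL re-acquisition, pre-3.12 PyErr_Fetch API) and the function names are hypothetical.

    #include <Python.h>
    #include <iostream>
    #include <string>

    // Turn a pending Python exception into an error string, roughly what
    // CheckPyError does when it builds an arrow::Status.
    static bool CheckPyErrorInto(std::string* out) {
      if (!PyErr_Occurred()) return true;
      PyObject* type = nullptr;
      PyObject* value = nullptr;
      PyObject* traceback = nullptr;
      PyErr_Fetch(&type, &value, &traceback);  // clears the error indicator
      PyObject* repr = value ? PyObject_Repr(value) : nullptr;
      const char* utf8 = repr ? PyUnicode_AsUTF8(repr) : nullptr;
      *out = utf8 ? utf8 : "unknown Python error";
      Py_XDECREF(repr);
      Py_XDECREF(type);
      Py_XDECREF(value);
      Py_XDECREF(traceback);
      return false;
    }

    int main() {
      Py_Initialize();
      // Provoke a TypeError from C, as a failing Python callback would.
      PyObject* not_an_int = PyUnicode_FromString("boom");
      (void)PyLong_AsLong(not_an_int);
      std::string err;
      if (!CheckPyErrorInto(&err)) {
        std::cerr << "python error: " << err << "\n";
      }
      Py_DECREF(not_an_int);
      return Py_FinalizeEx();
    }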
-PyClientMiddlewareFactory::PyClientMiddlewareFactory(PyObject* factory,
-                                                     StartCallCallback start_call)
-    : start_call_(start_call) {
-  Py_INCREF(factory);
-  factory_.reset(factory);
-}
-
-void PyClientMiddlewareFactory::StartCall(
-    const arrow::flight::CallInfo& info,
-    std::unique_ptr<arrow::flight::ClientMiddleware>* middleware) {
-  const Status& status = SafeCallIntoPython([&] {
-    const Status status = start_call_(factory_.obj(), info, middleware);
-    RETURN_NOT_OK(CheckPyError());
-    return status;
-  });
-
-  ARROW_WARN_NOT_OK(status, "Python client middleware failed in StartCall");
-}
-
-PyClientMiddleware::PyClientMiddleware(PyObject* middleware, Vtable vtable)
-    : vtable_(vtable) {
-  Py_INCREF(middleware);
-  middleware_.reset(middleware);
-}
-
-void PyClientMiddleware::SendingHeaders(arrow::flight::AddCallHeaders* outgoing_headers) {
-  const Status& status = SafeCallIntoPython([&] {
-    const Status status = vtable_.sending_headers(middleware_.obj(), outgoing_headers);
-    RETURN_NOT_OK(CheckPyError());
-    return status;
-  });
-
-  ARROW_WARN_NOT_OK(status, "Python client middleware failed in SendingHeaders");
-}
-
-void PyClientMiddleware::ReceivedHeaders(
-    const arrow::flight::CallHeaders& incoming_headers) {
-  const Status& status = SafeCallIntoPython([&] {
-    const Status status = vtable_.received_headers(middleware_.obj(), incoming_headers);
-    RETURN_NOT_OK(CheckPyError());
-    return status;
-  });
-
-  ARROW_WARN_NOT_OK(status, "Python client middleware failed in ReceivedHeaders");
-}
-
-void PyClientMiddleware::CallCompleted(const Status& call_status) {
-  const Status& status = SafeCallIntoPython([&] {
-    const Status status = vtable_.call_completed(middleware_.obj(), call_status);
-    RETURN_NOT_OK(CheckPyError());
-    return status;
-  });
-
-  ARROW_WARN_NOT_OK(status, "Python client middleware failed in CallCompleted");
-}
-
-Status CreateFlightInfo(const std::shared_ptr<arrow::Schema>& schema,
-                        const arrow::flight::FlightDescriptor& descriptor,
-                        const std::vector<arrow::flight::FlightEndpoint>& endpoints,
-                        int64_t total_records, int64_t total_bytes,
-                        std::unique_ptr<arrow::flight::FlightInfo>* out) {
-  ARROW_ASSIGN_OR_RAISE(auto result,
-                        arrow::flight::FlightInfo::Make(*schema, descriptor, endpoints,
-                                                        total_records, total_bytes));
-  *out = std::unique_ptr<arrow::flight::FlightInfo>(
-      new arrow::flight::FlightInfo(std::move(result)));
-  return Status::OK();
-}
-
-Status CreateSchemaResult(const std::shared_ptr<arrow::Schema>& schema,
-                          std::unique_ptr<arrow::flight::SchemaResult>* out) {
-  return arrow::flight::SchemaResult::Make(*schema).Value(out);
-}
-
-}  // namespace flight
-}  // namespace py
-}  // namespace arrow
diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/flight.h b/src/vendored/apache-arrow-12.0.1/arrow/python/flight.h
deleted file mode 100644
index 82d9371..0000000
--- a/src/vendored/apache-arrow-12.0.1/arrow/python/flight.h
+++ /dev/null
@@ -1,350 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. 
See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include -#include - -#include "arrow/flight/api.h" -#include "arrow/ipc/dictionary.h" -#include "arrow/python/common.h" - -#if defined(_WIN32) || defined(__CYGWIN__) // Windows -#if defined(_MSC_VER) -#pragma warning(disable : 4251) -#else -#pragma GCC diagnostic ignored "-Wattributes" -#endif - -#ifdef ARROW_PYTHON_STATIC -#define ARROW_PYFLIGHT_EXPORT -#elif defined(ARROW_PYFLIGHT_EXPORTING) -#define ARROW_PYFLIGHT_EXPORT __declspec(dllexport) -#else -#define ARROW_PYFLIGHT_EXPORT __declspec(dllimport) -#endif - -#else // Not Windows -#ifndef ARROW_PYFLIGHT_EXPORT -#define ARROW_PYFLIGHT_EXPORT __attribute__((visibility("default"))) -#endif -#endif // Non-Windows - -namespace arrow { - -namespace py { - -namespace flight { - -ARROW_PYFLIGHT_EXPORT -extern const char* kPyServerMiddlewareName; - -/// \brief A table of function pointers for calling from C++ into -/// Python. -class ARROW_PYFLIGHT_EXPORT PyFlightServerVtable { - public: - std::function*)> - list_flights; - std::function*)> - get_flight_info; - std::function*)> - get_schema; - std::function*)> - do_get; - std::function, - std::unique_ptr)> - do_put; - std::function, - std::unique_ptr)> - do_exchange; - std::function*)> - do_action; - std::function*)> - list_actions; -}; - -class ARROW_PYFLIGHT_EXPORT PyServerAuthHandlerVtable { - public: - std::function - authenticate; - std::function is_valid; -}; - -class ARROW_PYFLIGHT_EXPORT PyClientAuthHandlerVtable { - public: - std::function - authenticate; - std::function get_token; -}; - -/// \brief A helper to implement an auth mechanism in Python. -class ARROW_PYFLIGHT_EXPORT PyServerAuthHandler - : public arrow::flight::ServerAuthHandler { - public: - explicit PyServerAuthHandler(PyObject* handler, - const PyServerAuthHandlerVtable& vtable); - Status Authenticate(arrow::flight::ServerAuthSender* outgoing, - arrow::flight::ServerAuthReader* incoming) override; - Status IsValid(const std::string& token, std::string* peer_identity) override; - - private: - OwnedRefNoGIL handler_; - PyServerAuthHandlerVtable vtable_; -}; - -/// \brief A helper to implement an auth mechanism in Python. -class ARROW_PYFLIGHT_EXPORT PyClientAuthHandler - : public arrow::flight::ClientAuthHandler { - public: - explicit PyClientAuthHandler(PyObject* handler, - const PyClientAuthHandlerVtable& vtable); - Status Authenticate(arrow::flight::ClientAuthSender* outgoing, - arrow::flight::ClientAuthReader* incoming) override; - Status GetToken(std::string* token) override; - - private: - OwnedRefNoGIL handler_; - PyClientAuthHandlerVtable vtable_; -}; - -class ARROW_PYFLIGHT_EXPORT PyFlightServer : public arrow::flight::FlightServerBase { - public: - explicit PyFlightServer(PyObject* server, const PyFlightServerVtable& vtable); - - // Like Serve(), but set up signals and invoke Python signal handlers - // if necessary. This function may return with a Python exception set. 
- Status ServeWithSignals(); - - Status ListFlights(const arrow::flight::ServerCallContext& context, - const arrow::flight::Criteria* criteria, - std::unique_ptr* listings) override; - Status GetFlightInfo(const arrow::flight::ServerCallContext& context, - const arrow::flight::FlightDescriptor& request, - std::unique_ptr* info) override; - Status GetSchema(const arrow::flight::ServerCallContext& context, - const arrow::flight::FlightDescriptor& request, - std::unique_ptr* result) override; - Status DoGet(const arrow::flight::ServerCallContext& context, - const arrow::flight::Ticket& request, - std::unique_ptr* stream) override; - Status DoPut(const arrow::flight::ServerCallContext& context, - std::unique_ptr reader, - std::unique_ptr writer) override; - Status DoExchange(const arrow::flight::ServerCallContext& context, - std::unique_ptr reader, - std::unique_ptr writer) override; - Status DoAction(const arrow::flight::ServerCallContext& context, - const arrow::flight::Action& action, - std::unique_ptr* result) override; - Status ListActions(const arrow::flight::ServerCallContext& context, - std::vector* actions) override; - - private: - OwnedRefNoGIL server_; - PyFlightServerVtable vtable_; -}; - -/// \brief A callback that obtains the next result from a Flight action. -typedef std::function*)> - PyFlightResultStreamCallback; - -/// \brief A ResultStream built around a Python callback. -class ARROW_PYFLIGHT_EXPORT PyFlightResultStream : public arrow::flight::ResultStream { - public: - /// \brief Construct a FlightResultStream from a Python object and callback. - /// Must only be called while holding the GIL. - explicit PyFlightResultStream(PyObject* generator, - PyFlightResultStreamCallback callback); - arrow::Result> Next() override; - - private: - OwnedRefNoGIL generator_; - PyFlightResultStreamCallback callback_; -}; - -/// \brief A wrapper around a FlightDataStream that keeps alive a -/// Python object backing it. -class ARROW_PYFLIGHT_EXPORT PyFlightDataStream : public arrow::flight::FlightDataStream { - public: - /// \brief Construct a FlightDataStream from a Python object and underlying stream. - /// Must only be called while holding the GIL. - explicit PyFlightDataStream(PyObject* data_source, - std::unique_ptr stream); - - std::shared_ptr schema() override; - arrow::Result GetSchemaPayload() override; - arrow::Result Next() override; - - private: - OwnedRefNoGIL data_source_; - std::unique_ptr stream_; -}; - -class ARROW_PYFLIGHT_EXPORT PyServerMiddlewareFactory - : public arrow::flight::ServerMiddlewareFactory { - public: - /// \brief A callback to create the middleware instance in Python - typedef std::function* middleware)> - StartCallCallback; - - /// \brief Must only be called while holding the GIL. - explicit PyServerMiddlewareFactory(PyObject* factory, StartCallCallback start_call); - - Status StartCall(const arrow::flight::CallInfo& info, - const arrow::flight::CallHeaders& incoming_headers, - std::shared_ptr* middleware) override; - - private: - OwnedRefNoGIL factory_; - StartCallCallback start_call_; -}; - -class ARROW_PYFLIGHT_EXPORT PyServerMiddleware : public arrow::flight::ServerMiddleware { - public: - typedef std::function - SendingHeadersCallback; - typedef std::function CallCompletedCallback; - - struct Vtable { - SendingHeadersCallback sending_headers; - CallCompletedCallback call_completed; - }; - - /// \brief Must only be called while holding the GIL. 
- explicit PyServerMiddleware(PyObject* middleware, Vtable vtable); - - void SendingHeaders(arrow::flight::AddCallHeaders* outgoing_headers) override; - void CallCompleted(const Status& status) override; - std::string name() const override; - /// \brief Get the underlying Python object. - PyObject* py_object() const; - - private: - OwnedRefNoGIL middleware_; - Vtable vtable_; -}; - -class ARROW_PYFLIGHT_EXPORT PyClientMiddlewareFactory - : public arrow::flight::ClientMiddlewareFactory { - public: - /// \brief A callback to create the middleware instance in Python - typedef std::function* middleware)> - StartCallCallback; - - /// \brief Must only be called while holding the GIL. - explicit PyClientMiddlewareFactory(PyObject* factory, StartCallCallback start_call); - - void StartCall(const arrow::flight::CallInfo& info, - std::unique_ptr* middleware) override; - - private: - OwnedRefNoGIL factory_; - StartCallCallback start_call_; -}; - -class ARROW_PYFLIGHT_EXPORT PyClientMiddleware : public arrow::flight::ClientMiddleware { - public: - typedef std::function - SendingHeadersCallback; - typedef std::function - ReceivedHeadersCallback; - typedef std::function CallCompletedCallback; - - struct Vtable { - SendingHeadersCallback sending_headers; - ReceivedHeadersCallback received_headers; - CallCompletedCallback call_completed; - }; - - /// \brief Must only be called while holding the GIL. - explicit PyClientMiddleware(PyObject* factory, Vtable vtable); - - void SendingHeaders(arrow::flight::AddCallHeaders* outgoing_headers) override; - void ReceivedHeaders(const arrow::flight::CallHeaders& incoming_headers) override; - void CallCompleted(const Status& status) override; - - private: - OwnedRefNoGIL middleware_; - Vtable vtable_; -}; - -/// \brief A callback that obtains the next payload from a Flight result stream. -typedef std::function - PyGeneratorFlightDataStreamCallback; - -/// \brief A FlightDataStream built around a Python callback. -class ARROW_PYFLIGHT_EXPORT PyGeneratorFlightDataStream - : public arrow::flight::FlightDataStream { - public: - /// \brief Construct a FlightDataStream from a Python object and underlying stream. - /// Must only be called while holding the GIL. - explicit PyGeneratorFlightDataStream(PyObject* generator, - std::shared_ptr schema, - PyGeneratorFlightDataStreamCallback callback, - const ipc::IpcWriteOptions& options); - std::shared_ptr schema() override; - arrow::Result GetSchemaPayload() override; - arrow::Result Next() override; - - private: - OwnedRefNoGIL generator_; - std::shared_ptr schema_; - ipc::DictionaryFieldMapper mapper_; - ipc::IpcWriteOptions options_; - PyGeneratorFlightDataStreamCallback callback_; -}; - -ARROW_PYFLIGHT_EXPORT -Status CreateFlightInfo(const std::shared_ptr& schema, - const arrow::flight::FlightDescriptor& descriptor, - const std::vector& endpoints, - int64_t total_records, int64_t total_bytes, - std::unique_ptr* out); - -/// \brief Create a SchemaResult from schema. -ARROW_PYFLIGHT_EXPORT -Status CreateSchemaResult(const std::shared_ptr& schema, - std::unique_ptr* out); - -} // namespace flight -} // namespace py -} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/gdb.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/gdb.cc deleted file mode 100644 index 6941769..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/gdb.cc +++ /dev/null @@ -1,530 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include -#include -#include - -#include "arrow/array.h" -#include "arrow/chunked_array.h" -#include "arrow/datum.h" -#include "arrow/extension_type.h" -#include "arrow/ipc/json_simple.h" -#include "arrow/python/gdb.h" -#include "arrow/record_batch.h" -#include "arrow/scalar.h" -#include "arrow/table.h" -#include "arrow/type.h" -#include "arrow/util/debug.h" -#include "arrow/util/decimal.h" -#include "arrow/util/key_value_metadata.h" -#include "arrow/util/logging.h" -#include "arrow/util/macros.h" - -namespace arrow { - -using ipc::internal::json::ArrayFromJSON; -using ipc::internal::json::ChunkedArrayFromJSON; -using ipc::internal::json::ScalarFromJSON; - -namespace gdb { - -// Add a nested `arrow` namespace to exercise type lookup from GDB (ARROW-15652) -namespace arrow { -void DummyFunction() {} -} // namespace arrow - -namespace { - -class CustomStatusDetail : public StatusDetail { - public: - const char* type_id() const override { return "custom-detail-id"; } - std::string ToString() const override { return "This is a detail"; } -}; - -class UuidType : public ExtensionType { - public: - UuidType() : ExtensionType(fixed_size_binary(16)) {} - - std::string extension_name() const override { return "uuid"; } - - bool ExtensionEquals(const ExtensionType& other) const override { - return (other.extension_name() == this->extension_name()); - } - - std::shared_ptr MakeArray(std::shared_ptr data) const override { - return std::make_shared(data); - } - - Result> Deserialize( - std::shared_ptr storage_type, - const std::string& serialized) const override { - return Status::NotImplemented(""); - } - - std::string Serialize() const override { return "uuid-serialized"; } -}; - -std::shared_ptr SliceArrayFromJSON(const std::shared_ptr& ty, - std::string_view json, int64_t offset = 0, - int64_t length = -1) { - auto array = *ArrayFromJSON(ty, json); - if (length != -1) { - return array->Slice(offset, length); - } else { - return array->Slice(offset); - } -} - -} // namespace - -void TestSession() { - // We define local variables for all types for which we want to test - // pretty-printing. - // Then, at the end of this function, we trap to the debugger, so that - // test instrumentation can print values from this frame by interacting - // with the debugger. 
- // The test instrumentation is in pyarrow/tests/test_gdb.py - -#ifdef __clang__ - _Pragma("clang diagnostic push"); - _Pragma("clang diagnostic ignored \"-Wunused-variable\""); -#elif defined(__GNUC__) - _Pragma("GCC diagnostic push"); - _Pragma("GCC diagnostic ignored \"-Wunused-variable\""); -#endif - - arrow::DummyFunction(); - - // Status & Result - auto ok_status = Status::OK(); - auto error_status = Status::IOError("This is an error"); - auto error_detail_status = - error_status.WithDetail(std::make_shared()); - auto ok_result = Result(42); - auto error_result = Result(error_status); - auto error_detail_result = Result(error_detail_status); - - // String views - std::string_view string_view_abc{"abc"}; - std::string special_chars = std::string("foo\"bar") + '\x00' + "\r\n\t\x1f"; - std::string_view string_view_special_chars(special_chars); - - // Buffers - Buffer buffer_null{nullptr, 0}; - Buffer buffer_abc{string_view_abc}; - Buffer buffer_special_chars{string_view_special_chars}; - char mutable_array[3] = {'a', 'b', 'c'}; - MutableBuffer buffer_mutable{reinterpret_cast(mutable_array), 3}; - auto heap_buffer = std::make_shared(string_view_abc); - auto heap_buffer_mutable = *AllocateBuffer(buffer_abc.size()); - memcpy(heap_buffer_mutable->mutable_data(), buffer_abc.data(), buffer_abc.size()); - - // KeyValueMetadata - auto empty_metadata = key_value_metadata({}, {}); - auto metadata = key_value_metadata( - {"key_text", "key_binary"}, {"some value", std::string("z") + '\x00' + "\x1f\xff"}); - - // Decimals - Decimal128 decimal128_zero{}; - Decimal128 decimal128_pos{"98765432109876543210987654321098765432"}; - Decimal128 decimal128_neg{"-98765432109876543210987654321098765432"}; - BasicDecimal128 basic_decimal128_zero{}; - BasicDecimal128 basic_decimal128_pos{decimal128_pos.native_endian_array()}; - BasicDecimal128 basic_decimal128_neg{decimal128_neg.native_endian_array()}; - Decimal256 decimal256_zero{}; - Decimal256 decimal256_pos{ - "9876543210987654321098765432109876543210987654321098765432109876543210987654"}; - Decimal256 decimal256_neg{ - "-9876543210987654321098765432109876543210987654321098765432109876543210987654"}; - BasicDecimal256 basic_decimal256_zero{}; - BasicDecimal256 basic_decimal256_pos{decimal256_pos.native_endian_array()}; - BasicDecimal256 basic_decimal256_neg{decimal256_neg.native_endian_array()}; - - // Data types - NullType null_type; - auto heap_null_type = null(); - BooleanType bool_type; - auto heap_bool_type = boolean(); - - Date32Type date32_type; - Date64Type date64_type; - Time32Type time_type_s(TimeUnit::SECOND); - Time32Type time_type_ms(TimeUnit::MILLI); - Time64Type time_type_us(TimeUnit::MICRO); - Time64Type time_type_ns(TimeUnit::NANO); - auto heap_time_type_ns = time64(TimeUnit::NANO); - - TimestampType timestamp_type_s(TimeUnit::SECOND); - TimestampType timestamp_type_ms_timezone(TimeUnit::MILLI, "Europe/Paris"); - TimestampType timestamp_type_us(TimeUnit::MICRO); - TimestampType timestamp_type_ns_timezone(TimeUnit::NANO, "Europe/Paris"); - auto heap_timestamp_type_ns_timezone = timestamp(TimeUnit::NANO, "Europe/Paris"); - - DayTimeIntervalType day_time_interval_type; - MonthIntervalType month_interval_type; - MonthDayNanoIntervalType month_day_nano_interval_type; - - DurationType duration_type_s(TimeUnit::SECOND); - DurationType duration_type_ns(TimeUnit::NANO); - - BinaryType binary_type; - StringType string_type; - LargeBinaryType large_binary_type; - LargeStringType large_string_type; - FixedSizeBinaryType fixed_size_binary_type(10); - 
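Aside: a large share of the variables in this test function exist in sliced form because Arrow slices never copy: Slice(offset, length) produces a new Array sharing the same buffers, and the GDB pretty-printers must apply the offset themselves. A small stand-alone illustration against the public Arrow C++ API (assumes an installed libarrow; RunExample is a hypothetical name):

    #include <iostream>
    #include <memory>
    #include "arrow/api.h"

    arrow::Status RunExample() {
      arrow::Int32Builder builder;
      ARROW_RETURN_NOT_OK(builder.AppendValues({-5, 6}));
      ARROW_RETURN_NOT_OK(builder.AppendNull());
      ARROW_RETURN_NOT_OK(builder.Append(42));
      std::shared_ptr<arrow::Array> array;
      ARROW_RETURN_NOT_OK(builder.Finish(&array));
      // Slice() shares the underlying buffers; only offset and length differ,
      // which is exactly the case the sliced_* variables in TestSession exercise.
      std::shared_ptr<arrow::Array> sliced = array->Slice(1, 2);
      std::cout << array->ToString() << "\n" << sliced->ToString() << std::endl;
      return arrow::Status::OK();
    }

    int main() { return RunExample().ok() ? 0 : 1; }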
auto heap_fixed_size_binary_type = fixed_size_binary(10); - - Decimal128Type decimal128_type(16, 5); - Decimal256Type decimal256_type(42, 12); - auto heap_decimal128_type = decimal128(16, 5); - - ListType list_type(uint8()); - LargeListType large_list_type(large_utf8()); - auto heap_list_type = list(uint8()); - auto heap_large_list_type = large_list(large_utf8()); - - FixedSizeListType fixed_size_list_type(float64(), 3); - auto heap_fixed_size_list_type = fixed_size_list(float64(), 3); - - DictionaryType dict_type_unordered(int16(), utf8()); - DictionaryType dict_type_ordered(int16(), utf8(), /*ordered=*/true); - auto heap_dict_type = dictionary(int16(), utf8()); - - MapType map_type_unsorted(utf8(), binary()); - MapType map_type_sorted(utf8(), binary(), /*keys_sorted=*/true); - auto heap_map_type = map(utf8(), binary()); - - StructType struct_type_empty({}); - StructType struct_type( - {field("ints", int8()), field("strs", utf8(), /*nullable=*/false)}); - auto heap_struct_type = - struct_({field("ints", int8()), field("strs", utf8(), /*nullable=*/false)}); - - std::vector union_type_codes({7, 42}); - FieldVector union_fields( - {field("ints", int8()), field("strs", utf8(), /*nullable=*/false)}); - SparseUnionType sparse_union_type(union_fields, union_type_codes); - DenseUnionType dense_union_type(union_fields, union_type_codes); - - UuidType uuid_type{}; - std::shared_ptr heap_uuid_type = std::make_shared(); - - // Schema - auto schema_empty = schema({}); - auto schema_non_empty = schema({field("ints", int8()), field("strs", utf8())}); - auto schema_with_metadata = schema_non_empty->WithMetadata( - key_value_metadata({"key1", "key2"}, {"value1", "value2"})); - - // Fields - Field int_field("ints", int64()); - Field float_field("floats", float32(), /*nullable=*/false); - auto heap_int_field = field("ints", int64()); - - // Scalars - NullScalar null_scalar; - auto heap_null_scalar = MakeNullScalar(null()); - - BooleanScalar bool_scalar_null{}; - BooleanScalar bool_scalar{true}; - auto heap_bool_scalar = *MakeScalar(boolean(), true); - - Int8Scalar int8_scalar_null{}; - UInt8Scalar uint8_scalar_null{}; - Int64Scalar int64_scalar_null{}; - UInt64Scalar uint64_scalar_null{}; - Int8Scalar int8_scalar{-42}; - UInt8Scalar uint8_scalar{234}; - Int64Scalar int64_scalar{-9223372036854775807LL - 1}; - UInt64Scalar uint64_scalar{18446744073709551615ULL}; - HalfFloatScalar half_float_scalar{48640}; // -1.5 - FloatScalar float_scalar{1.25f}; - DoubleScalar double_scalar{2.5}; - - Time32Scalar time_scalar_s{100, TimeUnit::SECOND}; - Time32Scalar time_scalar_ms{1000, TimeUnit::MILLI}; - Time64Scalar time_scalar_us{10000, TimeUnit::MICRO}; - Time64Scalar time_scalar_ns{100000, TimeUnit::NANO}; - Time64Scalar time_scalar_null{time64(TimeUnit::NANO)}; - - DurationScalar duration_scalar_s{-100, TimeUnit::SECOND}; - DurationScalar duration_scalar_ms{-1000, TimeUnit::MILLI}; - DurationScalar duration_scalar_us{-10000, TimeUnit::MICRO}; - DurationScalar duration_scalar_ns{-100000, TimeUnit::NANO}; - DurationScalar duration_scalar_null{duration(TimeUnit::NANO)}; - - TimestampScalar timestamp_scalar_s{12345, timestamp(TimeUnit::SECOND)}; - TimestampScalar timestamp_scalar_ms{-123456, timestamp(TimeUnit::MILLI)}; - TimestampScalar timestamp_scalar_us{1234567, timestamp(TimeUnit::MICRO)}; - TimestampScalar timestamp_scalar_ns{-12345678, timestamp(TimeUnit::NANO)}; - TimestampScalar timestamp_scalar_null{timestamp(TimeUnit::NANO)}; - - TimestampScalar timestamp_scalar_s_tz{12345, - timestamp(TimeUnit::SECOND, 
"Europe/Paris")}; - TimestampScalar timestamp_scalar_ms_tz{-123456, - timestamp(TimeUnit::MILLI, "Europe/Paris")}; - TimestampScalar timestamp_scalar_us_tz{1234567, - timestamp(TimeUnit::MICRO, "Europe/Paris")}; - TimestampScalar timestamp_scalar_ns_tz{-12345678, - timestamp(TimeUnit::NANO, "Europe/Paris")}; - TimestampScalar timestamp_scalar_null_tz{timestamp(TimeUnit::NANO, "Europe/Paris")}; - - MonthIntervalScalar month_interval_scalar{23}; - MonthIntervalScalar month_interval_scalar_null{}; - DayTimeIntervalScalar day_time_interval_scalar{{23, -456}}; - DayTimeIntervalScalar day_time_interval_scalar_null{}; - MonthDayNanoIntervalScalar month_day_nano_interval_scalar{{1, 23, -456}}; - MonthDayNanoIntervalScalar month_day_nano_interval_scalar_null{}; - - Date32Scalar date32_scalar{23}; - Date32Scalar date32_scalar_null{}; - Date64Scalar date64_scalar{45 * 86400000LL}; - Date64Scalar date64_scalar_null{}; - - Decimal128Scalar decimal128_scalar_pos_scale_pos{Decimal128("1234567"), - decimal128(10, 4)}; - Decimal128Scalar decimal128_scalar_pos_scale_neg{Decimal128("-1234567"), - decimal128(10, 4)}; - Decimal128Scalar decimal128_scalar_neg_scale_pos{Decimal128("1234567"), - decimal128(10, -4)}; - Decimal128Scalar decimal128_scalar_neg_scale_neg{Decimal128("-1234567"), - decimal128(10, -4)}; - Decimal128Scalar decimal128_scalar_null{decimal128(10, 4)}; - auto heap_decimal128_scalar = *MakeScalar(decimal128(10, 4), Decimal128("1234567")); - - Decimal256Scalar decimal256_scalar_pos_scale_pos{ - Decimal256("1234567890123456789012345678901234567890123456"), decimal256(50, 4)}; - Decimal256Scalar decimal256_scalar_pos_scale_neg{ - Decimal256("-1234567890123456789012345678901234567890123456"), decimal256(50, 4)}; - Decimal256Scalar decimal256_scalar_neg_scale_pos{ - Decimal256("1234567890123456789012345678901234567890123456"), decimal256(50, -4)}; - Decimal256Scalar decimal256_scalar_neg_scale_neg{ - Decimal256("-1234567890123456789012345678901234567890123456"), decimal256(50, -4)}; - Decimal256Scalar decimal256_scalar_null{decimal256(50, 4)}; - auto heap_decimal256_scalar = *MakeScalar( - decimal256(50, 4), Decimal256("1234567890123456789012345678901234567890123456")); - - BinaryScalar binary_scalar_null{}; - BinaryScalar binary_scalar_unallocated{std::shared_ptr{nullptr}}; - BinaryScalar binary_scalar_empty{Buffer::FromString("")}; - BinaryScalar binary_scalar_abc{Buffer::FromString("abc")}; - BinaryScalar binary_scalar_bytes{ - Buffer::FromString(std::string() + '\x00' + "\x1f\xff")}; - - StringScalar string_scalar_null{}; - StringScalar string_scalar_unallocated{std::shared_ptr{nullptr}}; - StringScalar string_scalar_empty{Buffer::FromString("")}; - StringScalar string_scalar_hehe{Buffer::FromString("héhé")}; - StringScalar string_scalar_invalid_chars{ - Buffer::FromString(std::string("abc") + '\x00' + "def\xffghi")}; - - LargeBinaryScalar large_binary_scalar_abc{Buffer::FromString("abc")}; - LargeStringScalar large_string_scalar_hehe{Buffer::FromString("héhé")}; - - FixedSizeBinaryScalar fixed_size_binary_scalar{Buffer::FromString("abc"), - fixed_size_binary(3)}; - FixedSizeBinaryScalar fixed_size_binary_scalar_null{ - Buffer::FromString(" "), fixed_size_binary(3), /*is_valid=*/false}; - - std::shared_ptr dict_array; - dict_array = *ArrayFromJSON(utf8(), R"(["foo", "bar", "quux"])"); - DictionaryScalar dict_scalar{{std::make_shared(42), dict_array}, - dictionary(int8(), utf8())}; - DictionaryScalar dict_scalar_null{dictionary(int8(), utf8())}; - - std::shared_ptr list_value_array = 
*ArrayFromJSON(int32(), R"([4, 5, 6])"); - std::shared_ptr list_zero_length = *ArrayFromJSON(int32(), R"([])"); - ListScalar list_scalar{list_value_array}; - ListScalar list_scalar_null{list_zero_length, list(int32()), /*is_valid=*/false}; - LargeListScalar large_list_scalar{list_value_array}; - LargeListScalar large_list_scalar_null{list_zero_length, large_list(int32()), - /*is_valid=*/false}; - FixedSizeListScalar fixed_size_list_scalar{list_value_array}; - FixedSizeListScalar fixed_size_list_scalar_null{ - list_value_array, fixed_size_list(int32(), 3), /*is_valid=*/false}; - - auto struct_scalar_type = struct_({field("ints", int32()), field("strs", utf8())}); - StructScalar struct_scalar{ - ScalarVector{MakeScalar(int32_t(42)), MakeScalar("some text")}, struct_scalar_type}; - StructScalar struct_scalar_null{struct_scalar.value, struct_scalar_type, - /*is_valid=*/false}; - - auto sparse_union_scalar_type = - sparse_union(FieldVector{field("ints", int32()), field("strs", utf8())}, {7, 42}); - auto dense_union_scalar_type = - dense_union(FieldVector{field("ints", int32()), field("strs", utf8())}, {7, 42}); - std::vector> union_values = {MakeScalar(int32_t(43)), - MakeNullScalar(utf8())}; - SparseUnionScalar sparse_union_scalar{union_values, 7, sparse_union_scalar_type}; - DenseUnionScalar dense_union_scalar{union_values[0], 7, dense_union_scalar_type}; - - union_values[0] = MakeNullScalar(int32()); - SparseUnionScalar sparse_union_scalar_null{union_values, 7, sparse_union_scalar_type}; - DenseUnionScalar dense_union_scalar_null{union_values[0], 7, dense_union_scalar_type}; - - auto extension_scalar_type = std::make_shared(); - ExtensionScalar extension_scalar{ - std::make_shared(Buffer::FromString("0123456789abcdef"), - extension_scalar_type->storage_type()), - extension_scalar_type}; - ExtensionScalar extension_scalar_null{extension_scalar.value, extension_scalar_type, - /*is_valid=*/false}; - - std::shared_ptr heap_map_scalar; - ARROW_CHECK_OK( - ScalarFromJSON(map(utf8(), int32()), R"([["a", 5], ["b", 6]])", &heap_map_scalar)); - auto heap_map_scalar_null = MakeNullScalar(heap_map_scalar->type); - - // Array and ArrayData - auto heap_null_array = SliceArrayFromJSON(null(), "[null, null]"); - - auto heap_int32_array = SliceArrayFromJSON(int32(), "[-5, 6, null, 42]"); - ArrayData int32_array_data{*heap_int32_array->data()}; - Int32Array int32_array{heap_int32_array->data()->Copy()}; - - auto heap_int32_array_no_nulls = SliceArrayFromJSON(int32(), "[-5, 6, 3, 42]"); - - const char* json_int32_array = "[-1, 2, -3, 4, null, -5, 6, -7, 8, null, -9, -10]"; - auto heap_int32_array_sliced_1_9 = SliceArrayFromJSON(int32(), json_int32_array, 1, 9); - auto heap_int32_array_sliced_2_6 = SliceArrayFromJSON(int32(), json_int32_array, 2, 6); - auto heap_int32_array_sliced_8_4 = SliceArrayFromJSON(int32(), json_int32_array, 8, 4); - auto heap_int32_array_sliced_empty = - SliceArrayFromJSON(int32(), json_int32_array, 6, 0); - - const char* json_bool_array = - "[false, false, true, true, null, null, false, false, true, true, " - "null, null, false, false, true, true, null, null]"; - auto heap_bool_array = SliceArrayFromJSON(boolean(), json_bool_array); - auto heap_bool_array_sliced_1_9 = SliceArrayFromJSON(boolean(), json_bool_array, 1, 9); - auto heap_bool_array_sliced_2_6 = SliceArrayFromJSON(boolean(), json_bool_array, 2, 6); - auto heap_bool_array_sliced_empty = - SliceArrayFromJSON(boolean(), json_bool_array, 6, 0); - - auto heap_list_array = SliceArrayFromJSON(list(int64()), "[[1, 2], null, []]"); - 
ListArray list_array{heap_list_array->data()}; - - const char* json_double_array = "[-1.5, null]"; - auto heap_double_array = SliceArrayFromJSON(float64(), json_double_array); - - const char* json_float16_array = "[0, 48640]"; - auto heap_float16_array = - *SliceArrayFromJSON(uint16(), json_float16_array)->View(float16()); - - auto heap_date32_array = - SliceArrayFromJSON(date32(), "[0, null, 18336, -9004, -719162, -719163]"); - auto heap_date64_array = SliceArrayFromJSON( - date64(), "[1584230400000, -777945600000, -62135596800000, -62135683200000, 123]"); - - const char* json_time_array = "[null, -123, 456]"; - auto heap_time32_array_s = - SliceArrayFromJSON(time32(TimeUnit::SECOND), json_time_array); - auto heap_time32_array_ms = - SliceArrayFromJSON(time32(TimeUnit::MILLI), json_time_array); - auto heap_time64_array_us = - SliceArrayFromJSON(time64(TimeUnit::MICRO), json_time_array); - auto heap_time64_array_ns = SliceArrayFromJSON(time64(TimeUnit::NANO), json_time_array); - - auto heap_month_interval_array = - SliceArrayFromJSON(month_interval(), "[123, -456, null]"); - auto heap_day_time_interval_array = - SliceArrayFromJSON(day_time_interval(), "[[1, -600], null]"); - auto heap_month_day_nano_interval_array = - SliceArrayFromJSON(month_day_nano_interval(), "[[1, -600, 5000], null]"); - - const char* json_duration_array = "[null, -1234567890123456789]"; - auto heap_duration_array_s = - SliceArrayFromJSON(duration(TimeUnit::SECOND), json_duration_array); - auto heap_duration_array_ns = - SliceArrayFromJSON(duration(TimeUnit::NANO), json_duration_array); - - auto heap_timestamp_array_s = SliceArrayFromJSON( - timestamp(TimeUnit::SECOND), - R"([null, "1970-01-01 00:00:00", "1900-02-28 12:34:56", "3989-07-14 00:00:00"])"); - auto heap_timestamp_array_ms = SliceArrayFromJSON( - timestamp(TimeUnit::MILLI), - R"([null, "1900-02-28 12:34:56.123", "3989-07-14 00:00:00.789"])"); - auto heap_timestamp_array_us = SliceArrayFromJSON( - timestamp(TimeUnit::MICRO), - R"([null, "1900-02-28 12:34:56.654321", "3989-07-14 00:00:00.456789"])"); - auto heap_timestamp_array_ns = SliceArrayFromJSON( - timestamp(TimeUnit::NANO), R"([null, "1900-02-28 12:34:56.987654321"])"); - - auto heap_decimal128_array = SliceArrayFromJSON( - decimal128(30, 6), - R"([null, "-1234567890123456789.012345", "1234567890123456789.012345"])"); - auto heap_decimal256_array = SliceArrayFromJSON( - decimal256(50, 6), R"([null, "-123456789012345678901234567890123456789.012345"])"); - auto heap_decimal128_array_sliced = heap_decimal128_array->Slice(1, 1); - - auto heap_fixed_size_binary_array = - SliceArrayFromJSON(fixed_size_binary(3), "[null, \"abc\", \"\\u0000\\u001f\xff\"]"); - auto heap_fixed_size_binary_array_zero_width = - SliceArrayFromJSON(fixed_size_binary(0), R"([null, ""])"); - auto heap_fixed_size_binary_array_sliced = heap_fixed_size_binary_array->Slice(1, 1); - - const char* json_binary_array = "[null, \"abcd\", \"\\u0000\\u001f\xff\"]"; - auto heap_binary_array = SliceArrayFromJSON(binary(), json_binary_array); - auto heap_large_binary_array = SliceArrayFromJSON(large_binary(), json_binary_array); - const char* json_string_array = "[null, \"héhé\", \"invalid \xff char\"]"; - auto heap_string_array = SliceArrayFromJSON(utf8(), json_string_array); - auto heap_large_string_array = SliceArrayFromJSON(large_utf8(), json_string_array); - auto heap_binary_array_sliced = heap_binary_array->Slice(1, 1); - - // ChunkedArray - ArrayVector array_chunks(2); - array_chunks[0] = *ArrayFromJSON(int32(), "[1, 2]"); - array_chunks[1] 
= *ArrayFromJSON(int32(), "[3, null, 4]"); - ChunkedArray chunked_array{array_chunks}; - - // RecordBatch - auto batch_schema = schema({field("ints", int32()), field("strs", utf8())}); - ArrayVector batch_columns{2}; - batch_columns[0] = *ArrayFromJSON(int32(), "[1, 2, 3]"); - batch_columns[1] = *ArrayFromJSON(utf8(), R"(["abc", null, "def"])"); - auto batch = RecordBatch::Make(batch_schema, /*num_rows=*/3, batch_columns); - auto batch_with_metadata = batch->ReplaceSchemaMetadata( - key_value_metadata({"key1", "key2", "key3"}, {"value1", "value2", "value3"})); - - // Table - ChunkedArrayVector table_columns{2}; - ARROW_CHECK_OK( - ChunkedArrayFromJSON(int32(), {"[1, 2, 3]", "[4, 5]"}, &table_columns[0])); - ARROW_CHECK_OK(ChunkedArrayFromJSON( - utf8(), {R"(["abc", null])", R"(["def"])", R"(["ghi", "jkl"])"}, - &table_columns[1])); - auto table = Table::Make(batch_schema, table_columns); - - // Datum - Datum empty_datum{}; - Datum scalar_datum{MakeNullScalar(boolean())}; - Datum array_datum{heap_int32_array}; - Datum chunked_array_datum{chunked_array}; - Datum batch_datum{batch}; - Datum table_datum{table}; - -#ifdef __clang__ - _Pragma("clang diagnostic pop"); -#elif defined(__GNUC__) - _Pragma("GCC diagnostic pop"); -#endif - - // Hook into debugger - ::arrow::internal::DebugTrap(); -} - -} // namespace gdb -} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/gdb.h b/src/vendored/apache-arrow-12.0.1/arrow/python/gdb.h deleted file mode 100644 index 1ddcbb5..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/gdb.h +++ /dev/null @@ -1,29 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include "arrow/python/visibility.h" - -namespace arrow { -namespace gdb { - -ARROW_PYTHON_EXPORT -void TestSession(); - -} // namespace gdb -} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/helpers.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/helpers.cc deleted file mode 100644 index c266abc..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/helpers.cc +++ /dev/null @@ -1,470 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// helpers.h includes a NumPy header, so we include this first
-#include "arrow/python/numpy_interop.h"
-
-#include "arrow/python/helpers.h"
-
-#include <cmath>
-#include <limits>
-#include <sstream>
-#include <type_traits>
-
-#include "arrow/python/common.h"
-#include "arrow/python/decimal.h"
-#include "arrow/type_fwd.h"
-#include "arrow/util/checked_cast.h"
-#include "arrow/util/logging.h"
-
-namespace arrow {
-
-using internal::checked_cast;
-
-namespace py {
-
-#define GET_PRIMITIVE_TYPE(NAME, FACTORY) \
-  case Type::NAME:                        \
-    return FACTORY()
-
-std::shared_ptr<DataType> GetPrimitiveType(Type::type type) {
-  switch (type) {
-    case Type::NA:
-      return null();
-      GET_PRIMITIVE_TYPE(UINT8, uint8);
-      GET_PRIMITIVE_TYPE(INT8, int8);
-      GET_PRIMITIVE_TYPE(UINT16, uint16);
-      GET_PRIMITIVE_TYPE(INT16, int16);
-      GET_PRIMITIVE_TYPE(UINT32, uint32);
-      GET_PRIMITIVE_TYPE(INT32, int32);
-      GET_PRIMITIVE_TYPE(UINT64, uint64);
-      GET_PRIMITIVE_TYPE(INT64, int64);
-      GET_PRIMITIVE_TYPE(DATE32, date32);
-      GET_PRIMITIVE_TYPE(DATE64, date64);
-      GET_PRIMITIVE_TYPE(BOOL, boolean);
-      GET_PRIMITIVE_TYPE(HALF_FLOAT, float16);
-      GET_PRIMITIVE_TYPE(FLOAT, float32);
-      GET_PRIMITIVE_TYPE(DOUBLE, float64);
-      GET_PRIMITIVE_TYPE(BINARY, binary);
-      GET_PRIMITIVE_TYPE(STRING, utf8);
-      GET_PRIMITIVE_TYPE(LARGE_BINARY, large_binary);
-      GET_PRIMITIVE_TYPE(LARGE_STRING, large_utf8);
-      GET_PRIMITIVE_TYPE(INTERVAL_MONTH_DAY_NANO, month_day_nano_interval);
-    default:
-      return nullptr;
-  }
-}
-
-PyObject* PyHalf_FromHalf(npy_half value) {
-  PyObject* result = PyArrayScalar_New(Half);
-  if (result != NULL) {
-    PyArrayScalar_ASSIGN(result, Half, value);
-  }
-  return result;
-}
-
-Status PyFloat_AsHalf(PyObject* obj, npy_half* out) {
-  if (PyArray_IsScalar(obj, Half)) {
-    *out = PyArrayScalar_VAL(obj, Half);
-    return Status::OK();
-  } else {
-    // XXX: cannot use npy_double_to_half() without linking with Numpy
-    return Status::TypeError("Expected np.float16 instance");
-  }
-}
-
-namespace internal {
-
-std::string PyBytes_AsStdString(PyObject* obj) {
-  DCHECK(PyBytes_Check(obj));
-  return std::string(PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj));
-}
-
-Status PyUnicode_AsStdString(PyObject* obj, std::string* out) {
-  DCHECK(PyUnicode_Check(obj));
-  Py_ssize_t size;
-  // The utf-8 representation is cached on the unicode object
-  const char* data = PyUnicode_AsUTF8AndSize(obj, &size);
-  RETURN_IF_PYERROR();
-  *out = std::string(data, size);
-  return Status::OK();
-}
-
-std::string PyObject_StdStringRepr(PyObject* obj) {
-  OwnedRef unicode_ref(PyObject_Repr(obj));
-  OwnedRef bytes_ref;
-
-  if (unicode_ref) {
-    bytes_ref.reset(
-        PyUnicode_AsEncodedString(unicode_ref.obj(), "utf8", "backslashreplace"));
-  }
-  if (!bytes_ref) {
-    PyErr_Clear();
-    std::stringstream ss;
-    ss << "<object of type '" << Py_TYPE(obj)->tp_name << "' repr() failed>";
-    return ss.str();
-  }
-  return PyBytes_AsStdString(bytes_ref.obj());
-}
-
-Status PyObject_StdStringStr(PyObject* obj, std::string* out) {
-  OwnedRef string_ref(PyObject_Str(obj));
-  RETURN_IF_PYERROR();
-  return PyUnicode_AsStdString(string_ref.obj(), out);
-}
-
-Result<bool> IsModuleImported(const std::string& module_name) {
-  // PyImport_GetModuleDict returns with a borrowed reference
-  OwnedRef key(PyUnicode_FromString(module_name.c_str()));
-  auto is_imported = PyDict_Contains(PyImport_GetModuleDict(), key.obj());
-  RETURN_IF_PYERROR();
-  return is_imported;
-}
-
-Status ImportModule(const std::string& module_name, OwnedRef* ref) {
-  PyObject* module = PyImport_ImportModule(module_name.c_str());
-  RETURN_IF_PYERROR();
-  ref->reset(module);
-  return Status::OK();
-}
-
-Status ImportFromModule(PyObject* module, const std::string& name, OwnedRef* ref) {
-  PyObject* attr = PyObject_GetAttrString(module, name.c_str());
-  RETURN_IF_PYERROR();
-  ref->reset(attr);
-  return Status::OK();
-}
-
-namespace {
-
-Status IntegerOverflowStatus(PyObject* obj, const std::string& overflow_message) {
-  if (overflow_message.empty()) {
-    std::string obj_as_stdstring;
-    RETURN_NOT_OK(PyObject_StdStringStr(obj, &obj_as_stdstring));
-    return Status::Invalid("Value ", obj_as_stdstring,
-                           " too large to fit in C integer type");
-  } else {
-    return Status::Invalid(overflow_message);
-  }
-}
-
-Result<OwnedRef> PyObjectToPyInt(PyObject* obj) {
-  // Try to call __index__ or __int__ on `obj`
-  // (starting from Python 3.10, the latter isn't done anymore by PyLong_AsLong*).
-  OwnedRef ref(PyNumber_Index(obj));
-  if (ref) {
-    return std::move(ref);
-  }
-  PyErr_Clear();
-  const auto nb = Py_TYPE(obj)->tp_as_number;
-  if (nb && nb->nb_int) {
-    ref.reset(nb->nb_int(obj));
-    if (!ref) {
-      RETURN_IF_PYERROR();
-    }
-    DCHECK(ref);
-    return std::move(ref);
-  }
-  return Status::TypeError(
-      "object of type ",
-      PyObject_StdStringRepr(reinterpret_cast<PyObject*>(Py_TYPE(obj))),
-      " cannot be converted to int");
-}
-
-// Extract C signed int from Python object
-template <typename Int, enable_if_t<std::is_signed<Int>::value, Int> = 0>
-Status CIntFromPythonImpl(PyObject* obj, Int* out, const std::string& overflow_message) {
-  static_assert(sizeof(Int) <= sizeof(long long),  // NOLINT
-                "integer type larger than long long");
-
-  OwnedRef ref;
-  if (!PyLong_Check(obj)) {
-    ARROW_ASSIGN_OR_RAISE(ref, PyObjectToPyInt(obj));
-    obj = ref.obj();
-  }
-
-  if (sizeof(Int) > sizeof(long)) {  // NOLINT
-    const auto value = PyLong_AsLongLong(obj);
-    if (ARROW_PREDICT_FALSE(value == -1)) {
-      RETURN_IF_PYERROR();
-    }
-    if (ARROW_PREDICT_FALSE(value < std::numeric_limits<Int>::min() ||
-                            value > std::numeric_limits<Int>::max())) {
-      return IntegerOverflowStatus(obj, overflow_message);
-    }
-    *out = static_cast<Int>(value);
-  } else {
-    const auto value = PyLong_AsLong(obj);
-    if (ARROW_PREDICT_FALSE(value == -1)) {
-      RETURN_IF_PYERROR();
-    }
-    if (ARROW_PREDICT_FALSE(value < std::numeric_limits<Int>::min() ||
-                            value > std::numeric_limits<Int>::max())) {
-      return IntegerOverflowStatus(obj, overflow_message);
-    }
-    *out = static_cast<Int>(value);
-  }
-  return Status::OK();
-}
-
-// Extract C unsigned int from Python object
-template <typename Int, enable_if_t<std::is_unsigned<Int>::value, Int> = 0>
-Status CIntFromPythonImpl(PyObject* obj, Int* out, const std::string& overflow_message) {
-  static_assert(sizeof(Int) <= sizeof(unsigned long long),  // NOLINT
-                "integer type larger than unsigned long long");
-
-  OwnedRef ref;
-  if (!PyLong_Check(obj)) {
-    ARROW_ASSIGN_OR_RAISE(ref, PyObjectToPyInt(obj));
-    obj = ref.obj();
-  }
-
-  if (sizeof(Int) > sizeof(unsigned long)) {  // NOLINT
-    const auto value = PyLong_AsUnsignedLongLong(obj);
-    if (ARROW_PREDICT_FALSE(value == static_cast<decltype(value)>(-1))) {
-      RETURN_IF_PYERROR();
-    }
-    if (ARROW_PREDICT_FALSE(value > std::numeric_limits<Int>::max())) {
-      return IntegerOverflowStatus(obj, overflow_message);
-    }
-    *out = static_cast<Int>(value);
-  } else {
-    const auto value = PyLong_AsUnsignedLong(obj);
-    if (ARROW_PREDICT_FALSE(value == static_cast<decltype(value)>(-1))) {
-      RETURN_IF_PYERROR();
-    }
-    if (ARROW_PREDICT_FALSE(value > std::numeric_limits<Int>::max())) {
-      return IntegerOverflowStatus(obj, overflow_message);
-    }
-    *out = static_cast<Int>(value);
-  }
-  return Status::OK();
-}
-
-}  // namespace
-
-template <typename Int>
-Status CIntFromPython(PyObject* obj, Int* out, const std::string& overflow_message) {
-  if (PyBool_Check(obj)) {
-    return Status::TypeError("Expected integer, got bool");
-  }
-  return CIntFromPythonImpl(obj, out, overflow_message);
-}
-
-template Status CIntFromPython(PyObject*, int8_t*, const std::string&);
-template Status CIntFromPython(PyObject*, int16_t*, const std::string&);
-template Status CIntFromPython(PyObject*, int32_t*, const std::string&);
-template Status CIntFromPython(PyObject*, int64_t*, const std::string&);
-template Status CIntFromPython(PyObject*, uint8_t*, const std::string&);
-template Status CIntFromPython(PyObject*, uint16_t*, const std::string&);
-template Status CIntFromPython(PyObject*, uint32_t*, const std::string&);
-template Status CIntFromPython(PyObject*, uint64_t*, const std::string&);
-
-inline bool MayHaveNaN(PyObject* obj) {
-  // Some core types can be very quickly type-checked and do not allow NaN values
-  const int64_t non_nan_tpflags = Py_TPFLAGS_LONG_SUBCLASS | Py_TPFLAGS_LIST_SUBCLASS |
-                                  Py_TPFLAGS_TUPLE_SUBCLASS | Py_TPFLAGS_BYTES_SUBCLASS |
-                                  Py_TPFLAGS_UNICODE_SUBCLASS | Py_TPFLAGS_DICT_SUBCLASS |
-                                  Py_TPFLAGS_BASE_EXC_SUBCLASS | Py_TPFLAGS_TYPE_SUBCLASS;
-  return !PyType_HasFeature(Py_TYPE(obj), non_nan_tpflags);
-}
-
-bool PyFloat_IsNaN(PyObject* obj) {
-  return PyFloat_Check(obj) && std::isnan(PyFloat_AsDouble(obj));
-}
-
-namespace {
-
-static bool pandas_static_initialized = false;
-
-// Once initialized, these variables hold borrowed references to Pandas static data.
-// We should not use OwnedRef here because Python destructors would be
-// called on a finalized interpreter.
-static PyObject* pandas_NA = nullptr;
-static PyObject* pandas_NaT = nullptr;
-static PyObject* pandas_Timedelta = nullptr;
-static PyObject* pandas_Timestamp = nullptr;
-static PyTypeObject* pandas_NaTType = nullptr;
-static PyObject* pandas_DateOffset = nullptr;
-
-}  // namespace
-
-void InitPandasStaticData() {
-  // NOTE: This is called with the GIL held. We needn't (and shouldn't,
-  // to avoid deadlocks) use an additional C++ lock (ARROW-10519).
-  if (pandas_static_initialized) {
-    return;
-  }
-
-  OwnedRef pandas;
-
-  // Import pandas
-  Status s = ImportModule("pandas", &pandas);
-  if (!s.ok()) {
-    return;
-  }
-
-  // Since ImportModule can release the GIL, another thread could have
-  // already initialized the static data.
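// [Editorial note, not part of the original file] The re-check just below
// is double-checked initialization with the GIL itself serving as the lock;
// a minimal sketch of the pattern (names illustrative):
//
//   if (initialized) return;       // fast path, GIL held
//   ImportModule("pandas", ...);   // may release and reacquire the GIL
//   if (initialized) return;       // re-check: another thread may have won
//   /* ...cache the borrowed references... */
//   initialized = true;            // publish last, still under the GIL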
- if (pandas_static_initialized) { - return; - } - OwnedRef ref; - - // set NaT sentinel and its type - if (ImportFromModule(pandas.obj(), "NaT", &ref).ok()) { - pandas_NaT = ref.obj(); - // PyObject_Type returns a new reference but we trust that pandas.NaT will - // outlive our use of this PyObject* - pandas_NaTType = Py_TYPE(ref.obj()); - } - - // retain a reference to Timedelta - if (ImportFromModule(pandas.obj(), "Timedelta", &ref).ok()) { - pandas_Timedelta = ref.obj(); - } - - // retain a reference to Timestamp - if (ImportFromModule(pandas.obj(), "Timestamp", &ref).ok()) { - pandas_Timestamp = ref.obj(); - } - - // if pandas.NA exists, retain a reference to it - if (ImportFromModule(pandas.obj(), "NA", &ref).ok()) { - pandas_NA = ref.obj(); - } - - // Import DateOffset type - if (ImportFromModule(pandas.obj(), "DateOffset", &ref).ok()) { - pandas_DateOffset = ref.obj(); - } - - pandas_static_initialized = true; -} - -bool PandasObjectIsNull(PyObject* obj) { - if (!MayHaveNaN(obj)) { - return false; - } - if (obj == Py_None) { - return true; - } - if (PyFloat_IsNaN(obj) || (pandas_NA && obj == pandas_NA) || - (pandas_NaTType && PyObject_TypeCheck(obj, pandas_NaTType)) || - (internal::PyDecimal_Check(obj) && internal::PyDecimal_ISNAN(obj))) { - return true; - } - return false; -} - -bool IsPandasTimedelta(PyObject* obj) { - return pandas_Timedelta && PyObject_IsInstance(obj, pandas_Timedelta); -} - -bool IsPandasTimestamp(PyObject* obj) { - return pandas_Timestamp && PyObject_IsInstance(obj, pandas_Timestamp); -} - -PyObject* BorrowPandasDataOffsetType() { return pandas_DateOffset; } - -Status InvalidValue(PyObject* obj, const std::string& why) { - auto obj_as_str = PyObject_StdStringRepr(obj); - return Status::Invalid("Could not convert ", std::move(obj_as_str), " with type ", - Py_TYPE(obj)->tp_name, ": ", why); -} - -Status InvalidType(PyObject* obj, const std::string& why) { - auto obj_as_str = PyObject_StdStringRepr(obj); - return Status::TypeError("Could not convert ", std::move(obj_as_str), " with type ", - Py_TYPE(obj)->tp_name, ": ", why); -} - -Status UnboxIntegerAsInt64(PyObject* obj, int64_t* out) { - if (PyLong_Check(obj)) { - int overflow = 0; - *out = PyLong_AsLongLongAndOverflow(obj, &overflow); - if (overflow) { - return Status::Invalid("PyLong is too large to fit int64"); - } - } else if (PyArray_IsScalar(obj, Byte)) { - *out = reinterpret_cast(obj)->obval; - } else if (PyArray_IsScalar(obj, UByte)) { - *out = reinterpret_cast(obj)->obval; - } else if (PyArray_IsScalar(obj, Short)) { - *out = reinterpret_cast(obj)->obval; - } else if (PyArray_IsScalar(obj, UShort)) { - *out = reinterpret_cast(obj)->obval; - } else if (PyArray_IsScalar(obj, Int)) { - *out = reinterpret_cast(obj)->obval; - } else if (PyArray_IsScalar(obj, UInt)) { - *out = reinterpret_cast(obj)->obval; - } else if (PyArray_IsScalar(obj, Long)) { - *out = reinterpret_cast(obj)->obval; - } else if (PyArray_IsScalar(obj, ULong)) { - *out = reinterpret_cast(obj)->obval; - } else if (PyArray_IsScalar(obj, LongLong)) { - *out = reinterpret_cast(obj)->obval; - } else if (PyArray_IsScalar(obj, Int64)) { - *out = reinterpret_cast(obj)->obval; - } else if (PyArray_IsScalar(obj, ULongLong)) { - *out = reinterpret_cast(obj)->obval; - } else if (PyArray_IsScalar(obj, UInt64)) { - *out = reinterpret_cast(obj)->obval; - } else { - return Status::Invalid("Integer scalar type not recognized"); - } - return Status::OK(); -} - -Status IntegerScalarToDoubleSafe(PyObject* obj, double* out) { - int64_t value = 0; - 
RETURN_NOT_OK(UnboxIntegerAsInt64(obj, &value));
-
-  constexpr int64_t kDoubleMax = 1LL << 53;
-  constexpr int64_t kDoubleMin = -(1LL << 53);
-
-  if (value < kDoubleMin || value > kDoubleMax) {
-    return Status::Invalid("Integer value ", value, " is outside of the range exactly",
-                           " representable by an IEEE 754 double precision value");
-  }
-  *out = static_cast<double>(value);
-  return Status::OK();
-}
-
-Status IntegerScalarToFloat32Safe(PyObject* obj, float* out) {
-  int64_t value = 0;
-  RETURN_NOT_OK(UnboxIntegerAsInt64(obj, &value));
-
-  constexpr int64_t kFloatMax = 1LL << 24;
-  constexpr int64_t kFloatMin = -(1LL << 24);
-
-  if (value < kFloatMin || value > kFloatMax) {
-    return Status::Invalid("Integer value ", value, " is outside of the range exactly",
-                           " representable by an IEEE 754 single precision value");
-  }
-  *out = static_cast<float>(value);
-  return Status::OK();
-}
-
-void DebugPrint(PyObject* obj) {
-  std::string repr = PyObject_StdStringRepr(obj);
-  PySys_WriteStderr("%s\n", repr.c_str());
-}
-
-}  // namespace internal
-}  // namespace py
-}  // namespace arrow
diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/helpers.h b/src/vendored/apache-arrow-12.0.1/arrow/python/helpers.h
deleted file mode 100644
index a8e5f80..0000000
--- a/src/vendored/apache-arrow-12.0.1/arrow/python/helpers.h
+++ /dev/null
@@ -1,159 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include "arrow/python/platform.h"
-
-#include <limits>
-#include <memory>
-#include <string>
-#include <utility>
-
-#include "arrow/python/numpy_interop.h"
-
-#include <numpy/halffloat.h>
-
-#include "arrow/python/visibility.h"
-#include "arrow/type.h"
-#include "arrow/util/macros.h"
-
-namespace arrow {
-
-namespace py {
-
-class OwnedRef;
-
-// \brief Get an arrow DataType instance from Arrow's Type::type enum
-// \param[in] type One of the values of Arrow's Type::type enum
-// \return A shared pointer to DataType
-ARROW_PYTHON_EXPORT std::shared_ptr<DataType> GetPrimitiveType(Type::type type);
-
-// \brief Construct a np.float16 object from a npy_half value.
-ARROW_PYTHON_EXPORT PyObject* PyHalf_FromHalf(npy_half value);
-
-// \brief Convert a Python object to a npy_half value.
-ARROW_PYTHON_EXPORT Status PyFloat_AsHalf(PyObject* obj, npy_half* out);
-
-namespace internal {
-
-// \brief Check that a Python module has been already imported
-// \param[in] module_name The name of the module
-Result<bool> IsModuleImported(const std::string& module_name);
-
-// \brief Import a Python module
-// \param[in] module_name The name of the module
-// \param[out] ref The OwnedRef containing the module PyObject*
-ARROW_PYTHON_EXPORT
-Status ImportModule(const std::string& module_name, OwnedRef* ref);
-
-// \brief Import an object from a Python module
-// \param[in] module A Python module
-// \param[in] name The name of the object to import
-// \param[out] ref The OwnedRef containing the \c name attribute of the Python module \c
-// module
-ARROW_PYTHON_EXPORT
-Status ImportFromModule(PyObject* module, const std::string& name, OwnedRef* ref);
-
-// \brief Check whether obj is an integer, independent of Python versions.
-inline bool IsPyInteger(PyObject* obj) { return PyLong_Check(obj); }
-
-// \brief Import symbols from pandas that we need for various type-checking,
-// like pandas.NaT or pandas.NA
-void InitPandasStaticData();
-
-// \brief Use pandas missing value semantics to check if a value is null
-ARROW_PYTHON_EXPORT
-bool PandasObjectIsNull(PyObject* obj);
-
-// \brief Check that obj is a pandas.Timedelta instance
-ARROW_PYTHON_EXPORT
-bool IsPandasTimedelta(PyObject* obj);
-
-// \brief Check that obj is a pandas.Timestamp instance
-bool IsPandasTimestamp(PyObject* obj);
-
-// \brief Return a borrowed reference to the pandas.tseries.offsets.DateOffset
-PyObject* BorrowPandasDataOffsetType();
-
-// \brief Check whether obj is a floating-point NaN
-ARROW_PYTHON_EXPORT
-bool PyFloat_IsNaN(PyObject* obj);
-
-inline bool IsPyBinary(PyObject* obj) {
-  return PyBytes_Check(obj) || PyByteArray_Check(obj) || PyMemoryView_Check(obj);
-}
-
-// \brief Convert a Python integer into a C integer
-// \param[in] obj A Python integer
-// \param[out] out A pointer to a C integer to hold the result of the conversion
-// \return The status of the operation
-template <typename Int>
-Status CIntFromPython(PyObject* obj, Int* out, const std::string& overflow_message = "");
-
-// \brief Convert a Python unicode string to a std::string
-ARROW_PYTHON_EXPORT
-Status PyUnicode_AsStdString(PyObject* obj, std::string* out);
-
-// \brief Convert a Python bytes object to a std::string
-ARROW_PYTHON_EXPORT
-std::string PyBytes_AsStdString(PyObject* obj);
-
-// \brief Call str() on the given object and return the result as a std::string
-ARROW_PYTHON_EXPORT
-Status PyObject_StdStringStr(PyObject* obj, std::string* out);
-
-// \brief Return the repr() of the given object (always succeeds)
-ARROW_PYTHON_EXPORT
-std::string PyObject_StdStringRepr(PyObject* obj);
-
-// \brief Cast the given size to int32_t, with error checking
-inline Status CastSize(Py_ssize_t size, int32_t* out,
-                       const char* error_msg = "Maximum size exceeded (2GB)") {
-  // size is assumed to be positive
-  if (size > std::numeric_limits<int32_t>::max()) {
-    return Status::Invalid(error_msg);
-  }
-  *out = static_cast<int32_t>(size);
-  return Status::OK();
-}
-
-inline Status CastSize(Py_ssize_t size, int64_t* out, const char* error_msg = NULLPTR) {
-  // size is assumed to be positive
-  *out = static_cast<int64_t>(size);
-  return Status::OK();
-}
-
-// \brief Print the Python object's __str__ form along with the passed error
-// message
-ARROW_PYTHON_EXPORT
-Status InvalidValue(PyObject* obj, const std::string& why);
-
-ARROW_PYTHON_EXPORT
-Status InvalidType(PyObject* obj, const
std::string& why); - -ARROW_PYTHON_EXPORT -Status IntegerScalarToDoubleSafe(PyObject* obj, double* result); -ARROW_PYTHON_EXPORT -Status IntegerScalarToFloat32Safe(PyObject* obj, float* result); - -// \brief Print Python object __repr__ -void DebugPrint(PyObject* obj); - -} // namespace internal -} // namespace py -} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/inference.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/inference.cc deleted file mode 100644 index 3407b32..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/inference.cc +++ /dev/null @@ -1,748 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "arrow/python/inference.h" -#include "arrow/python/numpy_interop.h" - -#include - -#include -#include -#include -#include -#include -#include - -#include "arrow/scalar.h" -#include "arrow/status.h" -#include "arrow/util/decimal.h" -#include "arrow/util/logging.h" - -#include "arrow/python/datetime.h" -#include "arrow/python/decimal.h" -#include "arrow/python/helpers.h" -#include "arrow/python/iterators.h" -#include "arrow/python/numpy_convert.h" - -namespace arrow { -namespace py { -namespace { -// Assigns a tuple to interval_types_tuple containing the nametuple for -// MonthDayNanoIntervalType and if present dateutil's relativedelta and -// pandas DateOffset. -Status ImportPresentIntervalTypes(OwnedRefNoGIL* interval_types_tuple) { - OwnedRef relative_delta_module; - // These are Optional imports so swallow errors. - OwnedRef relative_delta_type; - // Try to import pandas to get types. - internal::InitPandasStaticData(); - if (internal::ImportModule("dateutil.relativedelta", &relative_delta_module).ok()) { - RETURN_NOT_OK(internal::ImportFromModule(relative_delta_module.obj(), "relativedelta", - &relative_delta_type)); - } - - PyObject* date_offset_type = internal::BorrowPandasDataOffsetType(); - interval_types_tuple->reset( - PyTuple_New(1 + (date_offset_type != nullptr ? 1 : 0) + - (relative_delta_type.obj() != nullptr ? 
1 : 0))); - RETURN_IF_PYERROR(); - int index = 0; - PyTuple_SetItem(interval_types_tuple->obj(), index++, - internal::NewMonthDayNanoTupleType()); - RETURN_IF_PYERROR(); - if (date_offset_type != nullptr) { - Py_XINCREF(date_offset_type); - PyTuple_SetItem(interval_types_tuple->obj(), index++, date_offset_type); - RETURN_IF_PYERROR(); - } - if (relative_delta_type.obj() != nullptr) { - PyTuple_SetItem(interval_types_tuple->obj(), index++, relative_delta_type.detach()); - RETURN_IF_PYERROR(); - } - return Status::OK(); -} - -} // namespace - -#define _NUMPY_UNIFY_NOOP(DTYPE) \ - case NPY_##DTYPE: \ - return OK; - -#define _NUMPY_UNIFY_PROMOTE(DTYPE) \ - case NPY_##DTYPE: \ - current_type_num_ = dtype; \ - current_dtype_ = descr; \ - return OK; - -#define _NUMPY_UNIFY_PROMOTE_TO(DTYPE, NEW_TYPE) \ - case NPY_##DTYPE: \ - current_type_num_ = NPY_##NEW_TYPE; \ - current_dtype_ = PyArray_DescrFromType(current_type_num_); \ - return OK; - -// Form a consensus NumPy dtype to use for Arrow conversion for a -// collection of dtype objects observed one at a time -class NumPyDtypeUnifier { - public: - enum Action { OK, INVALID }; - - NumPyDtypeUnifier() : current_type_num_(-1), current_dtype_(nullptr) {} - - Status InvalidMix(int new_dtype) { - return Status::Invalid("Cannot mix NumPy dtypes ", - GetNumPyTypeName(current_type_num_), " and ", - GetNumPyTypeName(new_dtype)); - } - - int Observe_BOOL(PyArray_Descr* descr, int dtype) { return INVALID; } - - int Observe_INT8(PyArray_Descr* descr, int dtype) { - switch (dtype) { - _NUMPY_UNIFY_PROMOTE(INT16); - _NUMPY_UNIFY_PROMOTE(INT32); - _NUMPY_UNIFY_PROMOTE(INT64); - _NUMPY_UNIFY_PROMOTE(FLOAT32); - _NUMPY_UNIFY_PROMOTE(FLOAT64); - default: - return INVALID; - } - } - - int Observe_INT16(PyArray_Descr* descr, int dtype) { - switch (dtype) { - _NUMPY_UNIFY_NOOP(INT8); - _NUMPY_UNIFY_PROMOTE(INT32); - _NUMPY_UNIFY_PROMOTE(INT64); - _NUMPY_UNIFY_NOOP(UINT8); - _NUMPY_UNIFY_PROMOTE(FLOAT32); - _NUMPY_UNIFY_PROMOTE(FLOAT64); - default: - return INVALID; - } - } - - int Observe_INT32(PyArray_Descr* descr, int dtype) { - switch (dtype) { - _NUMPY_UNIFY_NOOP(INT8); - _NUMPY_UNIFY_NOOP(INT16); - _NUMPY_UNIFY_PROMOTE(INT32); - _NUMPY_UNIFY_PROMOTE(INT64); - _NUMPY_UNIFY_NOOP(UINT8); - _NUMPY_UNIFY_NOOP(UINT16); - _NUMPY_UNIFY_PROMOTE_TO(FLOAT32, FLOAT64); - _NUMPY_UNIFY_PROMOTE(FLOAT64); - default: - return INVALID; - } - } - - int Observe_INT64(PyArray_Descr* descr, int dtype) { - switch (dtype) { - _NUMPY_UNIFY_NOOP(INT8); - _NUMPY_UNIFY_NOOP(INT16); - _NUMPY_UNIFY_NOOP(INT32); - _NUMPY_UNIFY_NOOP(INT64); - _NUMPY_UNIFY_NOOP(UINT8); - _NUMPY_UNIFY_NOOP(UINT16); - _NUMPY_UNIFY_NOOP(UINT32); - _NUMPY_UNIFY_PROMOTE_TO(FLOAT32, FLOAT64); - _NUMPY_UNIFY_PROMOTE(FLOAT64); - default: - return INVALID; - } - } - - int Observe_UINT8(PyArray_Descr* descr, int dtype) { - switch (dtype) { - _NUMPY_UNIFY_PROMOTE(UINT16); - _NUMPY_UNIFY_PROMOTE(UINT32); - _NUMPY_UNIFY_PROMOTE(UINT64); - _NUMPY_UNIFY_PROMOTE(FLOAT32); - _NUMPY_UNIFY_PROMOTE(FLOAT64); - default: - return INVALID; - } - } - - int Observe_UINT16(PyArray_Descr* descr, int dtype) { - switch (dtype) { - _NUMPY_UNIFY_NOOP(UINT8); - _NUMPY_UNIFY_PROMOTE(UINT32); - _NUMPY_UNIFY_PROMOTE(UINT64); - _NUMPY_UNIFY_PROMOTE(FLOAT32); - _NUMPY_UNIFY_PROMOTE(FLOAT64); - default: - return INVALID; - } - } - - int Observe_UINT32(PyArray_Descr* descr, int dtype) { - switch (dtype) { - _NUMPY_UNIFY_NOOP(UINT8); - _NUMPY_UNIFY_NOOP(UINT16); - _NUMPY_UNIFY_PROMOTE(UINT64); - _NUMPY_UNIFY_PROMOTE_TO(FLOAT32, FLOAT64); - 
_NUMPY_UNIFY_PROMOTE(FLOAT64); - default: - return INVALID; - } - } - - int Observe_UINT64(PyArray_Descr* descr, int dtype) { - switch (dtype) { - _NUMPY_UNIFY_NOOP(UINT8); - _NUMPY_UNIFY_NOOP(UINT16); - _NUMPY_UNIFY_NOOP(UINT32); - _NUMPY_UNIFY_PROMOTE_TO(FLOAT32, FLOAT64); - _NUMPY_UNIFY_PROMOTE(FLOAT64); - default: - return INVALID; - } - } - - int Observe_FLOAT16(PyArray_Descr* descr, int dtype) { - switch (dtype) { - _NUMPY_UNIFY_PROMOTE(FLOAT32); - _NUMPY_UNIFY_PROMOTE(FLOAT64); - default: - return INVALID; - } - } - - int Observe_FLOAT32(PyArray_Descr* descr, int dtype) { - switch (dtype) { - _NUMPY_UNIFY_NOOP(INT8); - _NUMPY_UNIFY_NOOP(INT16); - _NUMPY_UNIFY_NOOP(INT32); - _NUMPY_UNIFY_NOOP(INT64); - _NUMPY_UNIFY_NOOP(UINT8); - _NUMPY_UNIFY_NOOP(UINT16); - _NUMPY_UNIFY_NOOP(UINT32); - _NUMPY_UNIFY_NOOP(UINT64); - _NUMPY_UNIFY_PROMOTE(FLOAT64); - default: - return INVALID; - } - } - - int Observe_FLOAT64(PyArray_Descr* descr, int dtype) { - switch (dtype) { - _NUMPY_UNIFY_NOOP(INT8); - _NUMPY_UNIFY_NOOP(INT16); - _NUMPY_UNIFY_NOOP(INT32); - _NUMPY_UNIFY_NOOP(INT64); - _NUMPY_UNIFY_NOOP(UINT8); - _NUMPY_UNIFY_NOOP(UINT16); - _NUMPY_UNIFY_NOOP(UINT32); - _NUMPY_UNIFY_NOOP(UINT64); - default: - return INVALID; - } - } - - int Observe_DATETIME(PyArray_Descr* dtype_obj) { - // TODO: check that units are all the same - return OK; - } - - Status Observe(PyArray_Descr* descr) { - int dtype = fix_numpy_type_num(descr->type_num); - - if (current_type_num_ == -1) { - current_dtype_ = descr; - current_type_num_ = dtype; - return Status::OK(); - } else if (current_type_num_ == dtype) { - return Status::OK(); - } - -#define OBSERVE_CASE(DTYPE) \ - case NPY_##DTYPE: \ - action = Observe_##DTYPE(descr, dtype); \ - break; - - int action = OK; - switch (current_type_num_) { - OBSERVE_CASE(BOOL); - OBSERVE_CASE(INT8); - OBSERVE_CASE(INT16); - OBSERVE_CASE(INT32); - OBSERVE_CASE(INT64); - OBSERVE_CASE(UINT8); - OBSERVE_CASE(UINT16); - OBSERVE_CASE(UINT32); - OBSERVE_CASE(UINT64); - OBSERVE_CASE(FLOAT16); - OBSERVE_CASE(FLOAT32); - OBSERVE_CASE(FLOAT64); - case NPY_DATETIME: - action = Observe_DATETIME(descr); - break; - default: - return Status::NotImplemented("Unsupported numpy type ", GetNumPyTypeName(dtype)); - } - - if (action == INVALID) { - return InvalidMix(dtype); - } - return Status::OK(); - } - - bool dtype_was_observed() const { return current_type_num_ != -1; } - - PyArray_Descr* current_dtype() const { return current_dtype_; } - - int current_type_num() const { return current_type_num_; } - - private: - int current_type_num_; - PyArray_Descr* current_dtype_; -}; - -class TypeInferrer { - // A type inference visitor for Python values - public: - // \param validate_interval the number of elements to observe before checking - // whether the data is mixed type or has other problems. 
This helps avoid - // excess computation for each element while also making sure we "bail out" - // early with long sequences that may have problems up front - // \param make_unions permit mixed-type data by creating union types (not yet - // implemented) - explicit TypeInferrer(bool pandas_null_sentinels = false, - int64_t validate_interval = 100, bool make_unions = false) - : pandas_null_sentinels_(pandas_null_sentinels), - validate_interval_(validate_interval), - make_unions_(make_unions), - total_count_(0), - none_count_(0), - bool_count_(0), - int_count_(0), - date_count_(0), - time_count_(0), - timestamp_micro_count_(0), - duration_count_(0), - float_count_(0), - binary_count_(0), - unicode_count_(0), - decimal_count_(0), - list_count_(0), - struct_count_(0), - arrow_scalar_count_(0), - numpy_dtype_count_(0), - interval_count_(0), - max_decimal_metadata_(std::numeric_limits::min(), - std::numeric_limits::min()), - decimal_type_() { - ARROW_CHECK_OK(internal::ImportDecimalType(&decimal_type_)); - ARROW_CHECK_OK(ImportPresentIntervalTypes(&interval_types_)); - } - - /// \param[in] obj a Python object in the sequence - /// \param[out] keep_going if sufficient information has been gathered to - /// attempt to begin converting the sequence, *keep_going will be set to true - /// to signal to the calling visitor loop to terminate - Status Visit(PyObject* obj, bool* keep_going) { - ++total_count_; - - if (obj == Py_None || (pandas_null_sentinels_ && internal::PandasObjectIsNull(obj))) { - ++none_count_; - } else if (PyBool_Check(obj)) { - ++bool_count_; - *keep_going = make_unions_; - } else if (PyFloat_Check(obj)) { - ++float_count_; - *keep_going = make_unions_; - } else if (internal::IsPyInteger(obj)) { - ++int_count_; - } else if (PyDateTime_Check(obj)) { - // infer timezone from the first encountered datetime object - if (!timestamp_micro_count_) { - OwnedRef tzinfo(PyObject_GetAttrString(obj, "tzinfo")); - if (tzinfo.obj() != nullptr && tzinfo.obj() != Py_None) { - ARROW_ASSIGN_OR_RAISE(timezone_, internal::TzinfoToString(tzinfo.obj())); - } - } - ++timestamp_micro_count_; - *keep_going = make_unions_; - } else if (PyDelta_Check(obj)) { - ++duration_count_; - *keep_going = make_unions_; - } else if (PyDate_Check(obj)) { - ++date_count_; - *keep_going = make_unions_; - } else if (PyTime_Check(obj)) { - ++time_count_; - *keep_going = make_unions_; - } else if (internal::IsPyBinary(obj)) { - ++binary_count_; - *keep_going = make_unions_; - } else if (PyUnicode_Check(obj)) { - ++unicode_count_; - *keep_going = make_unions_; - } else if (arrow::py::is_scalar(obj)) { - RETURN_NOT_OK(VisitArrowScalar(obj, keep_going)); - } else if (PyArray_CheckAnyScalarExact(obj)) { - RETURN_NOT_OK(VisitDType(PyArray_DescrFromScalar(obj), keep_going)); - } else if (PySet_Check(obj) || (Py_TYPE(obj) == &PyDictValues_Type)) { - RETURN_NOT_OK(VisitSet(obj, keep_going)); - } else if (PyArray_Check(obj)) { - RETURN_NOT_OK(VisitNdarray(obj, keep_going)); - } else if (PyDict_Check(obj)) { - RETURN_NOT_OK(VisitDict(obj)); - } else if (PyList_Check(obj) || - (PyTuple_Check(obj) && - !PyObject_IsInstance(obj, PyTuple_GetItem(interval_types_.obj(), 0)))) { - RETURN_NOT_OK(VisitList(obj, keep_going)); - } else if (PyObject_IsInstance(obj, decimal_type_.obj())) { - RETURN_NOT_OK(max_decimal_metadata_.Update(obj)); - ++decimal_count_; - } else if (PyObject_IsInstance(obj, interval_types_.obj())) { - ++interval_count_; - } else { - return internal::InvalidValue(obj, - "did not recognize Python value type when inferring " - 
"an Arrow data type"); - } - - if (total_count_ % validate_interval_ == 0) { - RETURN_NOT_OK(Validate()); - } - - return Status::OK(); - } - - // Infer value type from a sequence of values - Status VisitSequence(PyObject* obj, PyObject* mask = nullptr) { - if (mask == nullptr || mask == Py_None) { - return internal::VisitSequence( - obj, /*offset=*/0, - [this](PyObject* value, bool* keep_going) { return Visit(value, keep_going); }); - } else { - return internal::VisitSequenceMasked( - obj, mask, /*offset=*/0, - [this](PyObject* value, uint8_t masked, bool* keep_going) { - if (!masked) { - return Visit(value, keep_going); - } else { - return Status::OK(); - } - }); - } - } - - // Infer value type from a sequence of values - Status VisitIterable(PyObject* obj) { - return internal::VisitIterable(obj, [this](PyObject* value, bool* keep_going) { - return Visit(value, keep_going); - }); - } - - Status GetType(std::shared_ptr* out) { - // TODO(wesm): handling forming unions - if (make_unions_) { - return Status::NotImplemented("Creating union types not yet supported"); - } - - RETURN_NOT_OK(Validate()); - - if (arrow_scalar_count_ > 0 && arrow_scalar_count_ + none_count_ != total_count_) { - return Status::Invalid( - "pyarrow scalars cannot be mixed " - "with other Python scalar values currently"); - } - - if (numpy_dtype_count_ > 0) { - // All NumPy scalars and Nones/nulls - if (numpy_dtype_count_ + none_count_ == total_count_) { - std::shared_ptr type; - RETURN_NOT_OK(NumPyDtypeToArrow(numpy_unifier_.current_dtype(), &type)); - *out = type; - return Status::OK(); - } - - // The "bad path": data contains a mix of NumPy scalars and - // other kinds of scalars. Note this can happen innocuously - // because numpy.nan is not a NumPy scalar (it's a built-in - // PyFloat) - - // TODO(ARROW-5564): Merge together type unification so this - // hack is not necessary - switch (numpy_unifier_.current_type_num()) { - case NPY_BOOL: - bool_count_ += numpy_dtype_count_; - break; - case NPY_INT8: - case NPY_INT16: - case NPY_INT32: - case NPY_INT64: - case NPY_UINT8: - case NPY_UINT16: - case NPY_UINT32: - case NPY_UINT64: - int_count_ += numpy_dtype_count_; - break; - case NPY_FLOAT32: - case NPY_FLOAT64: - float_count_ += numpy_dtype_count_; - break; - case NPY_DATETIME: - return Status::Invalid( - "numpy.datetime64 scalars cannot be mixed " - "with other Python scalar values currently"); - } - } - - if (list_count_) { - std::shared_ptr value_type; - RETURN_NOT_OK(list_inferrer_->GetType(&value_type)); - *out = list(value_type); - } else if (struct_count_) { - RETURN_NOT_OK(GetStructType(out)); - } else if (decimal_count_) { - if (max_decimal_metadata_.precision() > Decimal128Type::kMaxPrecision) { - // the default constructor does not validate the precision and scale - ARROW_ASSIGN_OR_RAISE(*out, - Decimal256Type::Make(max_decimal_metadata_.precision(), - max_decimal_metadata_.scale())); - } else { - ARROW_ASSIGN_OR_RAISE(*out, - Decimal128Type::Make(max_decimal_metadata_.precision(), - max_decimal_metadata_.scale())); - } - } else if (float_count_) { - // Prioritize floats before integers - *out = float64(); - } else if (int_count_) { - *out = int64(); - } else if (date_count_) { - *out = date32(); - } else if (time_count_) { - *out = time64(TimeUnit::MICRO); - } else if (timestamp_micro_count_) { - *out = timestamp(TimeUnit::MICRO, timezone_); - } else if (duration_count_) { - *out = duration(TimeUnit::MICRO); - } else if (bool_count_) { - *out = boolean(); - } else if (binary_count_) { - *out = binary(); 
- } else if (unicode_count_) { - *out = utf8(); - } else if (interval_count_) { - *out = month_day_nano_interval(); - } else if (arrow_scalar_count_) { - *out = scalar_type_; - } else { - *out = null(); - } - return Status::OK(); - } - - int64_t total_count() const { return total_count_; } - - protected: - Status Validate() const { - if (list_count_ > 0) { - if (list_count_ + none_count_ != total_count_) { - return Status::Invalid("cannot mix list and non-list, non-null values"); - } - RETURN_NOT_OK(list_inferrer_->Validate()); - } else if (struct_count_ > 0) { - if (struct_count_ + none_count_ != total_count_) { - return Status::Invalid("cannot mix struct and non-struct, non-null values"); - } - for (const auto& it : struct_inferrers_) { - RETURN_NOT_OK(it.second.Validate()); - } - } - return Status::OK(); - } - - Status VisitArrowScalar(PyObject* obj, bool* keep_going /* unused */) { - ARROW_ASSIGN_OR_RAISE(auto scalar, arrow::py::unwrap_scalar(obj)); - // Check that all the scalar types for the sequence are the same - if (arrow_scalar_count_ > 0 && *scalar->type != *scalar_type_) { - return internal::InvalidValue(obj, "cannot mix scalars with different types"); - } - scalar_type_ = scalar->type; - ++arrow_scalar_count_; - return Status::OK(); - } - - Status VisitDType(PyArray_Descr* dtype, bool* keep_going) { - // Continue visiting dtypes for now. - // TODO(wesm): devise approach for unions - ++numpy_dtype_count_; - *keep_going = true; - return numpy_unifier_.Observe(dtype); - } - - Status VisitList(PyObject* obj, bool* keep_going /* unused */) { - if (!list_inferrer_) { - list_inferrer_.reset( - new TypeInferrer(pandas_null_sentinels_, validate_interval_, make_unions_)); - } - ++list_count_; - return list_inferrer_->VisitSequence(obj); - } - - Status VisitSet(PyObject* obj, bool* keep_going /* unused */) { - if (!list_inferrer_) { - list_inferrer_.reset( - new TypeInferrer(pandas_null_sentinels_, validate_interval_, make_unions_)); - } - ++list_count_; - return list_inferrer_->VisitIterable(obj); - } - - Status VisitNdarray(PyObject* obj, bool* keep_going) { - PyArray_Descr* dtype = PyArray_DESCR(reinterpret_cast(obj)); - if (dtype->type_num == NPY_OBJECT) { - return VisitList(obj, keep_going); - } - // Not an object array: infer child Arrow type from dtype - if (!list_inferrer_) { - list_inferrer_.reset( - new TypeInferrer(pandas_null_sentinels_, validate_interval_, make_unions_)); - } - ++list_count_; - - // XXX(wesm): In ARROW-4324 I added accounting to check whether - // all of the non-null values have NumPy dtypes, but the - // total_count not not being properly incremented here - ++(*list_inferrer_).total_count_; - return list_inferrer_->VisitDType(dtype, keep_going); - } - - Status VisitDict(PyObject* obj) { - PyObject* key_obj; - PyObject* value_obj; - Py_ssize_t pos = 0; - - while (PyDict_Next(obj, &pos, &key_obj, &value_obj)) { - std::string key; - if (PyUnicode_Check(key_obj)) { - RETURN_NOT_OK(internal::PyUnicode_AsStdString(key_obj, &key)); - } else if (PyBytes_Check(key_obj)) { - key = internal::PyBytes_AsStdString(key_obj); - } else { - return Status::TypeError("Expected dict key of type str or bytes, got '", - Py_TYPE(key_obj)->tp_name, "'"); - } - // Get or create visitor for this key - auto it = struct_inferrers_.find(key); - if (it == struct_inferrers_.end()) { - it = struct_inferrers_ - .insert( - std::make_pair(key, TypeInferrer(pandas_null_sentinels_, - validate_interval_, make_unions_))) - .first; - } - TypeInferrer* visitor = &it->second; - - // We ignore 
termination signals from child visitors for now - // - // TODO(wesm): keep track of whether type inference has terminated for - // the child visitors to avoid doing unneeded work - bool keep_going = true; - RETURN_NOT_OK(visitor->Visit(value_obj, &keep_going)); - } - - // We do not terminate visiting dicts since we want the union of all - // observed keys - ++struct_count_; - return Status::OK(); - } - - Status GetStructType(std::shared_ptr* out) { - std::vector> fields; - for (auto&& it : struct_inferrers_) { - std::shared_ptr field_type; - RETURN_NOT_OK(it.second.GetType(&field_type)); - fields.emplace_back(field(it.first, field_type)); - } - *out = struct_(fields); - return Status::OK(); - } - - private: - bool pandas_null_sentinels_; - int64_t validate_interval_; - bool make_unions_; - int64_t total_count_; - int64_t none_count_; - int64_t bool_count_; - int64_t int_count_; - int64_t date_count_; - int64_t time_count_; - int64_t timestamp_micro_count_; - std::string timezone_; - int64_t duration_count_; - int64_t float_count_; - int64_t binary_count_; - int64_t unicode_count_; - int64_t decimal_count_; - int64_t list_count_; - int64_t struct_count_; - int64_t arrow_scalar_count_; - int64_t numpy_dtype_count_; - int64_t interval_count_; - std::unique_ptr list_inferrer_; - std::map struct_inferrers_; - std::shared_ptr scalar_type_; - - // If we observe a strongly-typed value in e.g. a NumPy array, we can store - // it here to skip the type counting logic above - NumPyDtypeUnifier numpy_unifier_; - - internal::DecimalMetadata max_decimal_metadata_; - - OwnedRefNoGIL decimal_type_; - OwnedRefNoGIL interval_types_; -}; - -// Non-exhaustive type inference -Result> InferArrowType(PyObject* obj, PyObject* mask, - bool pandas_null_sentinels) { - if (pandas_null_sentinels) { - // ARROW-842: If pandas is not installed then null checks will be less - // comprehensive, but that is okay. - internal::InitPandasStaticData(); - } - - std::shared_ptr out_type; - TypeInferrer inferrer(pandas_null_sentinels); - RETURN_NOT_OK(inferrer.VisitSequence(obj, mask)); - RETURN_NOT_OK(inferrer.GetType(&out_type)); - if (out_type == nullptr) { - return Status::TypeError("Unable to determine data type"); - } else { - return std::move(out_type); - } -} - -ARROW_PYTHON_EXPORT -bool IsPyBool(PyObject* obj) { return internal::PyBoolScalar_Check(obj); } - -ARROW_PYTHON_EXPORT -bool IsPyInt(PyObject* obj) { return internal::PyIntScalar_Check(obj); } - -ARROW_PYTHON_EXPORT -bool IsPyFloat(PyObject* obj) { return internal::PyFloatScalar_Check(obj); } - -} // namespace py -} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/inference.h b/src/vendored/apache-arrow-12.0.1/arrow/python/inference.h deleted file mode 100644 index 983384d..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/inference.h +++ /dev/null @@ -1,64 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Functions for converting between CPython built-in data structures and Arrow -// data structures - -#pragma once - -#include "arrow/python/platform.h" - -#include - -#include "arrow/python/visibility.h" -#include "arrow/type.h" -#include "arrow/util/macros.h" - -#include "common.h" - -namespace arrow { - -class Array; -class Status; - -namespace py { - -// These functions take a sequence input, not arbitrary iterables - -/// \brief Infer Arrow type from a Python sequence -/// \param[in] obj the sequence of values -/// \param[in] mask an optional mask where True values are null. May -/// be nullptr -/// \param[in] pandas_null_sentinels use pandas's null value markers -ARROW_PYTHON_EXPORT -Result> InferArrowType(PyObject* obj, PyObject* mask, - bool pandas_null_sentinels); - -/// Checks whether the passed Python object is a boolean scalar -ARROW_PYTHON_EXPORT -bool IsPyBool(PyObject* obj); - -/// Checks whether the passed Python object is an integer scalar -ARROW_PYTHON_EXPORT -bool IsPyInt(PyObject* obj); - -/// Checks whether the passed Python object is a float scalar -ARROW_PYTHON_EXPORT -bool IsPyFloat(PyObject* obj); - -} // namespace py -} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/init.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/init.cc deleted file mode 100644 index dba293b..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/init.cc +++ /dev/null @@ -1,24 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Trigger the array import (inversion of NO_IMPORT_ARRAY) -#define NUMPY_IMPORT_ARRAY - -#include "arrow/python/init.h" -#include "arrow/python/numpy_interop.h" - -int arrow_init_numpy() { return arrow::py::import_numpy(); } diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/init.h b/src/vendored/apache-arrow-12.0.1/arrow/python/init.h deleted file mode 100644 index 2e6c954..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/init.h +++ /dev/null @@ -1,26 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include "arrow/python/platform.h" -#include "arrow/python/visibility.h" - -extern "C" { -ARROW_PYTHON_EXPORT -int arrow_init_numpy(); -} diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/io.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/io.cc deleted file mode 100644 index 43f8297..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/io.cc +++ /dev/null @@ -1,384 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "io.h" - -#include -#include -#include -#include -#include - -#include "arrow/io/memory.h" -#include "arrow/memory_pool.h" -#include "arrow/status.h" -#include "arrow/util/logging.h" - -#include "arrow/python/common.h" -#include "arrow/python/pyarrow.h" - -namespace arrow { - -using arrow::io::TransformInputStream; - -namespace py { - -// ---------------------------------------------------------------------- -// Python file - -// A common interface to a Python file-like object. 
Must acquire GIL before -// calling any methods -class PythonFile { - public: - explicit PythonFile(PyObject* file) : file_(file), checked_read_buffer_(false) { - Py_INCREF(file); - } - - Status CheckClosed() const { - if (!file_) { - return Status::Invalid("operation on closed Python file"); - } - return Status::OK(); - } - - Status Close() { - if (file_) { - PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "close", "()"); - Py_XDECREF(result); - file_.reset(); - PY_RETURN_IF_ERROR(StatusCode::IOError); - } - return Status::OK(); - } - - Status Abort() { - file_.reset(); - return Status::OK(); - } - - bool closed() const { - if (!file_) { - return true; - } - PyObject* result = PyObject_GetAttrString(file_.obj(), "closed"); - if (result == NULL) { - // Can't propagate the error, so write it out and return an arbitrary value - PyErr_WriteUnraisable(NULL); - return true; - } - int ret = PyObject_IsTrue(result); - Py_XDECREF(result); - if (ret < 0) { - PyErr_WriteUnraisable(NULL); - return true; - } - return ret != 0; - } - - Status Seek(int64_t position, int whence) { - RETURN_NOT_OK(CheckClosed()); - - // whence: 0 for relative to start of file, 2 for end of file - PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "seek", "(ni)", - static_cast(position), whence); - Py_XDECREF(result); - PY_RETURN_IF_ERROR(StatusCode::IOError); - return Status::OK(); - } - - Status Read(int64_t nbytes, PyObject** out) { - RETURN_NOT_OK(CheckClosed()); - - PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "read", "(n)", - static_cast(nbytes)); - PY_RETURN_IF_ERROR(StatusCode::IOError); - *out = result; - return Status::OK(); - } - - Status ReadBuffer(int64_t nbytes, PyObject** out) { - PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "read_buffer", "(n)", - static_cast(nbytes)); - PY_RETURN_IF_ERROR(StatusCode::IOError); - *out = result; - return Status::OK(); - } - - Status Write(const void* data, int64_t nbytes) { - RETURN_NOT_OK(CheckClosed()); - - // Since the data isn't owned, we have to make a copy - PyObject* py_data = - PyBytes_FromStringAndSize(reinterpret_cast(data), nbytes); - PY_RETURN_IF_ERROR(StatusCode::IOError); - - PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "write", "(O)", py_data); - Py_XDECREF(py_data); - Py_XDECREF(result); - PY_RETURN_IF_ERROR(StatusCode::IOError); - return Status::OK(); - } - - Status Write(const std::shared_ptr& buffer) { - RETURN_NOT_OK(CheckClosed()); - - PyObject* py_data = wrap_buffer(buffer); - PY_RETURN_IF_ERROR(StatusCode::IOError); - - PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "write", "(O)", py_data); - Py_XDECREF(py_data); - Py_XDECREF(result); - PY_RETURN_IF_ERROR(StatusCode::IOError); - return Status::OK(); - } - - Result Tell() { - RETURN_NOT_OK(CheckClosed()); - - PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "tell", "()"); - PY_RETURN_IF_ERROR(StatusCode::IOError); - - int64_t position = PyLong_AsLongLong(result); - Py_DECREF(result); - - // PyLong_AsLongLong can raise OverflowError - PY_RETURN_IF_ERROR(StatusCode::IOError); - return position; - } - - std::mutex& lock() { return lock_; } - - bool HasReadBuffer() { - if (!checked_read_buffer_) { // we don't want to check this each time - has_read_buffer_ = PyObject_HasAttrString(file_.obj(), "read_buffer") == 1; - checked_read_buffer_ = true; - } - return has_read_buffer_; - } - - private: - std::mutex lock_; - OwnedRefNoGIL file_; - bool has_read_buffer_; - bool checked_read_buffer_; -}; - -// 
---------------------------------------------------------------------- -// Seekable input stream - -PyReadableFile::PyReadableFile(PyObject* file) { file_.reset(new PythonFile(file)); } - -// The destructor does not close the underlying Python file object, as -// there may be multiple references to it. Instead let the Python -// destructor do its job. -PyReadableFile::~PyReadableFile() {} - -Status PyReadableFile::Abort() { - return SafeCallIntoPython([this]() { return file_->Abort(); }); -} - -Status PyReadableFile::Close() { - return SafeCallIntoPython([this]() { return file_->Close(); }); -} - -bool PyReadableFile::closed() const { - bool res; - Status st = SafeCallIntoPython([this, &res]() { - res = file_->closed(); - return Status::OK(); - }); - return res; -} - -Status PyReadableFile::Seek(int64_t position) { - return SafeCallIntoPython([=] { return file_->Seek(position, 0); }); -} - -Result PyReadableFile::Tell() const { - return SafeCallIntoPython([=]() -> Result { return file_->Tell(); }); -} - -Result PyReadableFile::Read(int64_t nbytes, void* out) { - return SafeCallIntoPython([=]() -> Result { - OwnedRef bytes; - RETURN_NOT_OK(file_->Read(nbytes, bytes.ref())); - PyObject* bytes_obj = bytes.obj(); - DCHECK(bytes_obj != NULL); - - Py_buffer py_buf; - if (!PyObject_GetBuffer(bytes_obj, &py_buf, PyBUF_ANY_CONTIGUOUS)) { - const uint8_t* data = reinterpret_cast(py_buf.buf); - std::memcpy(out, data, py_buf.len); - int64_t len = py_buf.len; - PyBuffer_Release(&py_buf); - return len; - } else { - return Status::TypeError( - "Python file read() should have returned a bytes object or an object " - "supporting the buffer protocol, got '", - Py_TYPE(bytes_obj)->tp_name, "' (did you open the file in binary mode?)"); - } - }); -} - -Result> PyReadableFile::Read(int64_t nbytes) { - return SafeCallIntoPython([=]() -> Result> { - OwnedRef buffer_obj; - if (file_->HasReadBuffer()) { - RETURN_NOT_OK(file_->ReadBuffer(nbytes, buffer_obj.ref())); - } else { - RETURN_NOT_OK(file_->Read(nbytes, buffer_obj.ref())); - } - DCHECK(buffer_obj.obj() != NULL); - - return PyBuffer::FromPyObject(buffer_obj.obj()); - }); -} - -Result PyReadableFile::ReadAt(int64_t position, int64_t nbytes, void* out) { - std::lock_guard guard(file_->lock()); - return SafeCallIntoPython([=]() -> Result { - RETURN_NOT_OK(Seek(position)); - return Read(nbytes, out); - }); -} - -Result> PyReadableFile::ReadAt(int64_t position, int64_t nbytes) { - std::lock_guard guard(file_->lock()); - return SafeCallIntoPython([=]() -> Result> { - RETURN_NOT_OK(Seek(position)); - return Read(nbytes); - }); -} - -Result PyReadableFile::GetSize() { - return SafeCallIntoPython([=]() -> Result { - ARROW_ASSIGN_OR_RAISE(int64_t current_position, file_->Tell()); - RETURN_NOT_OK(file_->Seek(0, 2)); - - ARROW_ASSIGN_OR_RAISE(int64_t file_size, file_->Tell()); - // Restore previous file position - RETURN_NOT_OK(file_->Seek(current_position, 0)); - - return file_size; - }); -} - -// ---------------------------------------------------------------------- -// Output stream - -PyOutputStream::PyOutputStream(PyObject* file) : position_(0) { - file_.reset(new PythonFile(file)); -} - -// The destructor does not close the underlying Python file object, as -// there may be multiple references to it. Instead let the Python -// destructor do its job. 
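// [Editorial note, not part of the original file] GetSize() above uses the
// portable tell/seek dance; whence == 2 mirrors Python's io.SEEK_END:
//
//   saved = tell();    // remember the caller's position
//   seek(0, 2);        // jump to end-of-file
//   size = tell();     // the offset at EOF is the file size
//   seek(saved, 0);    // restore, so subsequent reads are unaffected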
-PyOutputStream::~PyOutputStream() {} - -Status PyOutputStream::Abort() { - return SafeCallIntoPython([=]() { return file_->Abort(); }); -} - -Status PyOutputStream::Close() { - return SafeCallIntoPython([=]() { return file_->Close(); }); -} - -bool PyOutputStream::closed() const { - bool res; - Status st = SafeCallIntoPython([this, &res]() { - res = file_->closed(); - return Status::OK(); - }); - return res; -} - -Result PyOutputStream::Tell() const { return position_; } - -Status PyOutputStream::Write(const void* data, int64_t nbytes) { - return SafeCallIntoPython([=]() { - position_ += nbytes; - return file_->Write(data, nbytes); - }); -} - -Status PyOutputStream::Write(const std::shared_ptr& buffer) { - return SafeCallIntoPython([=]() { - position_ += buffer->size(); - return file_->Write(buffer); - }); -} - -// ---------------------------------------------------------------------- -// Foreign buffer - -Status PyForeignBuffer::Make(const uint8_t* data, int64_t size, PyObject* base, - std::shared_ptr* out) { - PyForeignBuffer* buf = new PyForeignBuffer(data, size, base); - if (buf == NULL) { - return Status::OutOfMemory("could not allocate foreign buffer object"); - } else { - *out = std::shared_ptr(buf); - return Status::OK(); - } -} - -// ---------------------------------------------------------------------- -// TransformInputStream::TransformFunc wrapper - -struct TransformFunctionWrapper { - TransformFunctionWrapper(TransformCallback cb, PyObject* arg) - : cb_(std::move(cb)), arg_(std::make_shared(arg)) { - Py_INCREF(arg); - } - - Result> operator()(const std::shared_ptr& src) { - return SafeCallIntoPython([=]() -> Result> { - std::shared_ptr dest; - cb_(arg_->obj(), src, &dest); - RETURN_NOT_OK(CheckPyError()); - return dest; - }); - } - - protected: - // Need to wrap OwnedRefNoGIL because std::function needs the callable - // to be copy-constructible... - TransformCallback cb_; - std::shared_ptr arg_; -}; - -std::shared_ptr<::arrow::io::InputStream> MakeTransformInputStream( - std::shared_ptr<::arrow::io::InputStream> wrapped, TransformInputStreamVTable vtable, - PyObject* handler) { - TransformInputStream::TransformFunc transform( - TransformFunctionWrapper{std::move(vtable.transform), handler}); - return std::make_shared(std::move(wrapped), std::move(transform)); -} - -std::shared_ptr MakeStreamTransformFunc(TransformInputStreamVTable vtable, - PyObject* handler) { - TransformInputStream::TransformFunc transform( - TransformFunctionWrapper{std::move(vtable.transform), handler}); - StreamWrapFunc func = [transform](std::shared_ptr<::arrow::io::InputStream> wrapped) { - return std::make_shared(wrapped, transform); - }; - return std::make_shared(func); -} - -} // namespace py -} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/io.h b/src/vendored/apache-arrow-12.0.1/arrow/python/io.h deleted file mode 100644 index 10489c1..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/io.h +++ /dev/null @@ -1,121 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <memory>
-
-#include "arrow/io/interfaces.h"
-#include "arrow/io/transform.h"
-
-#include "arrow/python/common.h"
-#include "arrow/python/visibility.h"
-
-namespace arrow {
-namespace py {
-
-class ARROW_NO_EXPORT PythonFile;
-
-class ARROW_PYTHON_EXPORT PyReadableFile : public io::RandomAccessFile {
- public:
-  explicit PyReadableFile(PyObject* file);
-  ~PyReadableFile() override;
-
-  Status Close() override;
-  Status Abort() override;
-  bool closed() const override;
-
-  Result<int64_t> Read(int64_t nbytes, void* out) override;
-  Result<std::shared_ptr<Buffer>> Read(int64_t nbytes) override;
-
-  // Thread-safe version
-  Result<int64_t> ReadAt(int64_t position, int64_t nbytes, void* out) override;
-
-  // Thread-safe version
-  Result<std::shared_ptr<Buffer>> ReadAt(int64_t position, int64_t nbytes) override;
-
-  Result<int64_t> GetSize() override;
-
-  Status Seek(int64_t position) override;
-
-  Result<int64_t> Tell() const override;
-
- private:
-  std::unique_ptr<PythonFile> file_;
-};
-
-class ARROW_PYTHON_EXPORT PyOutputStream : public io::OutputStream {
- public:
-  explicit PyOutputStream(PyObject* file);
-  ~PyOutputStream() override;
-
-  Status Close() override;
-  Status Abort() override;
-  bool closed() const override;
-  Result<int64_t> Tell() const override;
-  Status Write(const void* data, int64_t nbytes) override;
-  Status Write(const std::shared_ptr<Buffer>& buffer) override;
-
- private:
-  std::unique_ptr<PythonFile> file_;
-  int64_t position_;
-};
-
-// TODO(wesm): seekable output files
-
-// A Buffer subclass that keeps a PyObject reference throughout its
-// lifetime, such that the Python object is kept alive as long as the
-// C++ buffer is still needed.
-// Keeping the reference in a Python wrapper would be incorrect as
-// the Python wrapper can get destroyed even though the wrapped C++
-// buffer is still alive (ARROW-2270).
-class ARROW_PYTHON_EXPORT PyForeignBuffer : public Buffer {
- public:
-  static Status Make(const uint8_t* data, int64_t size, PyObject* base,
-                     std::shared_ptr<Buffer>* out);
-
- private:
-  PyForeignBuffer(const uint8_t* data, int64_t size, PyObject* base)
-      : Buffer(data, size) {
-    Py_INCREF(base);
-    base_.reset(base);
-  }
-
-  OwnedRefNoGIL base_;
-};
-
-// All this rigamarole because Cython is really poor with std::function<>
-
-using TransformCallback = std::function<void(
-    PyObject*, const std::shared_ptr<Buffer>& src, std::shared_ptr<Buffer>* out)>;
-
-struct TransformInputStreamVTable {
-  TransformCallback transform;
-};
-
-ARROW_PYTHON_EXPORT
-std::shared_ptr<::arrow::io::InputStream> MakeTransformInputStream(
-    std::shared_ptr<::arrow::io::InputStream> wrapped, TransformInputStreamVTable vtable,
-    PyObject* arg);
-
-using StreamWrapFunc = std::function<Result<std::shared_ptr<::arrow::io::InputStream>>(
-    std::shared_ptr<::arrow::io::InputStream>)>;
-ARROW_PYTHON_EXPORT
-std::shared_ptr<StreamWrapFunc> MakeStreamTransformFunc(TransformInputStreamVTable vtable,
-                                                        PyObject* handler);
-}  // namespace py
-}  // namespace arrow
diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/ipc.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/ipc.cc
deleted file mode 100644
index 9348182..0000000
--- a/src/vendored/apache-arrow-12.0.1/arrow/python/ipc.cc
+++ /dev/null
@@ -1,67 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "ipc.h"
-
-#include <memory>
-
-#include "arrow/python/pyarrow.h"
-
-namespace arrow {
-namespace py {
-
-PyRecordBatchReader::PyRecordBatchReader() {}
-
-Status PyRecordBatchReader::Init(std::shared_ptr<Schema> schema, PyObject* iterable) {
-  schema_ = std::move(schema);
-
-  iterator_.reset(PyObject_GetIter(iterable));
-  return CheckPyError();
-}
-
-std::shared_ptr<Schema> PyRecordBatchReader::schema() const { return schema_; }
-
-Status PyRecordBatchReader::ReadNext(std::shared_ptr<RecordBatch>* batch) {
-  PyAcquireGIL lock;
-
-  if (!iterator_) {
-    // End of stream
-    batch->reset();
-    return Status::OK();
-  }
-
-  OwnedRef py_batch(PyIter_Next(iterator_.obj()));
-  if (!py_batch) {
-    RETURN_IF_PYERROR();
-    // End of stream
-    batch->reset();
-    iterator_.reset();
-    return Status::OK();
-  }
-
-  return unwrap_batch(py_batch.obj()).Value(batch);
-}
-
-Result<std::shared_ptr<RecordBatchReader>> PyRecordBatchReader::Make(
-    std::shared_ptr<Schema> schema, PyObject* iterable) {
-  auto reader = std::shared_ptr<PyRecordBatchReader>(new PyRecordBatchReader());
-  RETURN_NOT_OK(reader->Init(std::move(schema), iterable));
-  return reader;
-}
-
-}  // namespace py
-}  // namespace arrow
diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/ipc.h b/src/vendored/apache-arrow-12.0.1/arrow/python/ipc.h
deleted file mode 100644
index 92232ed..0000000
--- a/src/vendored/apache-arrow-12.0.1/arrow/python/ipc.h
+++ /dev/null
@@ -1,52 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
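[Editorial sketch, not part of the patch] For context on the API removed here, a hedged example of driving the PyRecordBatchReader declared below from C++; `schema` and `py_iterable` are assumed inputs, and ReadNext acquires the GIL itself, per the implementation above.

arrow::Status ConsumeBatches(std::shared_ptr<arrow::Schema> schema,
                             PyObject* py_iterable) {
  // Wrap a Python iterable of pyarrow.RecordBatch objects as a C++ reader.
  ARROW_ASSIGN_OR_RAISE(auto reader,
      arrow::py::PyRecordBatchReader::Make(std::move(schema), py_iterable));
  std::shared_ptr<arrow::RecordBatch> batch;
  do {
    // A null batch signals end-of-stream, mirroring ReadNext above.
    ARROW_RETURN_NOT_OK(reader->ReadNext(&batch));
  } while (batch != nullptr);
  return arrow::Status::OK();
}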
- -#pragma once - -#include - -#include "arrow/python/common.h" -#include "arrow/python/visibility.h" -#include "arrow/record_batch.h" -#include "arrow/result.h" -#include "arrow/util/macros.h" - -namespace arrow { -namespace py { - -class ARROW_PYTHON_EXPORT PyRecordBatchReader : public RecordBatchReader { - public: - std::shared_ptr schema() const override; - - Status ReadNext(std::shared_ptr* batch) override; - - // For use from Cython - // Assumes that `iterable` is borrowed - static Result> Make(std::shared_ptr, - PyObject* iterable); - - protected: - PyRecordBatchReader(); - - Status Init(std::shared_ptr, PyObject* iterable); - - std::shared_ptr schema_; - OwnedRefNoGIL iterator_; -}; - -} // namespace py -} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/iterators.h b/src/vendored/apache-arrow-12.0.1/arrow/python/iterators.h deleted file mode 100644 index 7b31962..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/iterators.h +++ /dev/null @@ -1,194 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include - -#include "arrow/array/array_primitive.h" - -#include "arrow/python/common.h" -#include "arrow/python/numpy_internal.h" - -namespace arrow { -namespace py { -namespace internal { - -using arrow::internal::checked_cast; - -// Visit the Python sequence, calling the given callable on each element. If -// the callable returns a non-OK status, iteration stops and the status is -// returned. -// -// The call signature for Visitor must be -// -// Visit(PyObject* obj, int64_t index, bool* keep_going) -// -// If keep_going is set to false, the iteration terminates -template -inline Status VisitSequenceGeneric(PyObject* obj, int64_t offset, VisitorFunc&& func) { - // VisitorFunc may set to false to terminate iteration - bool keep_going = true; - - if (PyArray_Check(obj)) { - PyArrayObject* arr_obj = reinterpret_cast(obj); - if (PyArray_NDIM(arr_obj) != 1) { - return Status::Invalid("Only 1D arrays accepted"); - } - - if (PyArray_DESCR(arr_obj)->type_num == NPY_OBJECT) { - // It's an array object, we can fetch object pointers directly - const Ndarray1DIndexer objects(arr_obj); - for (int64_t i = offset; keep_going && i < objects.size(); ++i) { - RETURN_NOT_OK(func(objects[i], i, &keep_going)); - } - return Status::OK(); - } - // It's a non-object array, fall back on regular sequence access. - // (note PyArray_GETITEM() is slightly different: it returns standard - // Python types, not Numpy scalar types) - // This code path is inefficient: callers should implement dedicated - // logic for non-object arrays. 
- } - if (PySequence_Check(obj)) { - if (PyList_Check(obj) || PyTuple_Check(obj)) { - // Use fast item access - const Py_ssize_t size = PySequence_Fast_GET_SIZE(obj); - for (Py_ssize_t i = offset; keep_going && i < size; ++i) { - PyObject* value = PySequence_Fast_GET_ITEM(obj, i); - RETURN_NOT_OK(func(value, static_cast(i), &keep_going)); - } - } else { - // Regular sequence: avoid making a potentially large copy - const Py_ssize_t size = PySequence_Size(obj); - RETURN_IF_PYERROR(); - for (Py_ssize_t i = offset; keep_going && i < size; ++i) { - OwnedRef value_ref(PySequence_ITEM(obj, i)); - RETURN_IF_PYERROR(); - RETURN_NOT_OK(func(value_ref.obj(), static_cast(i), &keep_going)); - } - } - } else { - return Status::TypeError("Object is not a sequence"); - } - return Status::OK(); -} - -// Visit sequence with no null mask -template -inline Status VisitSequence(PyObject* obj, int64_t offset, VisitorFunc&& func) { - return VisitSequenceGeneric( - obj, offset, [&func](PyObject* value, int64_t i /* unused */, bool* keep_going) { - return func(value, keep_going); - }); -} - -/// Visit sequence with null mask -template -inline Status VisitSequenceMasked(PyObject* obj, PyObject* mo, int64_t offset, - VisitorFunc&& func) { - if (PyArray_Check(mo)) { - PyArrayObject* mask = reinterpret_cast(mo); - if (PyArray_NDIM(mask) != 1) { - return Status::Invalid("Mask must be 1D array"); - } - if (PyArray_SIZE(mask) != static_cast(PySequence_Size(obj))) { - return Status::Invalid("Mask was a different length from sequence being converted"); - } - - const int dtype = fix_numpy_type_num(PyArray_DESCR(mask)->type_num); - if (dtype == NPY_BOOL) { - Ndarray1DIndexer mask_values(mask); - - return VisitSequenceGeneric( - obj, offset, - [&func, &mask_values](PyObject* value, int64_t i, bool* keep_going) { - return func(value, mask_values[i], keep_going); - }); - } else { - return Status::TypeError("Mask must be boolean dtype"); - } - } else if (py::is_array(mo)) { - auto unwrap_mask_result = unwrap_array(mo); - ARROW_RETURN_NOT_OK(unwrap_mask_result); - std::shared_ptr mask_ = unwrap_mask_result.ValueOrDie(); - if (mask_->type_id() != Type::type::BOOL) { - return Status::TypeError("Mask must be an array of booleans"); - } - - if (mask_->length() != PySequence_Size(obj)) { - return Status::Invalid("Mask was a different length from sequence being converted"); - } - - if (mask_->null_count() != 0) { - return Status::TypeError("Mask must be an array of booleans"); - } - - BooleanArray* boolmask = checked_cast(mask_.get()); - return VisitSequenceGeneric( - obj, offset, [&func, &boolmask](PyObject* value, int64_t i, bool* keep_going) { - return func(value, boolmask->Value(i), keep_going); - }); - } else if (PySequence_Check(mo)) { - if (PySequence_Size(mo) != PySequence_Size(obj)) { - return Status::Invalid("Mask was a different length from sequence being converted"); - } - RETURN_IF_PYERROR(); - - return VisitSequenceGeneric( - obj, offset, [&func, &mo](PyObject* value, int64_t i, bool* keep_going) { - OwnedRef value_ref(PySequence_ITEM(mo, i)); - if (!PyBool_Check(value_ref.obj())) - return Status::TypeError("Mask must be a sequence of booleans"); - return func(value, value_ref.obj() == Py_True, keep_going); - }); - } else { - return Status::Invalid("Null mask must be a NumPy array, Arrow array or a Sequence"); - } - - return Status::OK(); -} - -// Like IterateSequence, but accepts any generic iterable (including -// non-restartable iterators, e.g. generators). 
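As a usage sketch of the sequence visitors above, assuming the vendored iterators.h is on the include path: the callable receives each element, its index, and a keep_going flag that it may clear to stop the scan early, per the documented contract. FindFirstNone is an illustrative name.

// Sketch: find the index of the first None in a Python sequence using
// VisitSequenceGeneric (defined above).
#include <Python.h>

#include "arrow/python/iterators.h"

arrow::Status FindFirstNone(PyObject* seq, int64_t* index_of_none) {
  *index_of_none = -1;
  return arrow::py::internal::VisitSequenceGeneric(
      seq, /*offset=*/0,
      [index_of_none](PyObject* value, int64_t i, bool* keep_going) {
        if (value == Py_None) {
          *index_of_none = i;
          *keep_going = false;  // terminate iteration early
        }
        return arrow::Status::OK();
      });
}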
-// -// The call signature for VisitorFunc must be Visit(PyObject*, bool* -// keep_going). If keep_going is set to false, the iteration terminates -template -inline Status VisitIterable(PyObject* obj, VisitorFunc&& func) { - if (PySequence_Check(obj)) { - // Numpy arrays fall here as well - return VisitSequence(obj, /*offset=*/0, std::forward(func)); - } - // Fall back on the iterator protocol - OwnedRef iter_ref(PyObject_GetIter(obj)); - PyObject* iter = iter_ref.obj(); - RETURN_IF_PYERROR(); - PyObject* value; - - bool keep_going = true; - while (keep_going && (value = PyIter_Next(iter))) { - OwnedRef value_ref(value); - RETURN_NOT_OK(func(value_ref.obj(), &keep_going)); - } - RETURN_IF_PYERROR(); // __next__() might have raised - return Status::OK(); -} - -} // namespace internal -} // namespace py -} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/lib.h b/src/vendored/apache-arrow-12.0.1/arrow/python/lib.h deleted file mode 100644 index bd314e3..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/lib.h +++ /dev/null @@ -1,63 +0,0 @@ -/* Generated by Cython 0.29.35 */ - -#ifndef __PYX_HAVE__pyarrow__lib -#define __PYX_HAVE__pyarrow__lib - -#include "Python.h" - -#ifndef __PYX_HAVE_API__pyarrow__lib - -#ifndef __PYX_EXTERN_C - #ifdef __cplusplus - #define __PYX_EXTERN_C extern "C" - #else - #define __PYX_EXTERN_C extern - #endif -#endif - -#ifndef DL_IMPORT - #define DL_IMPORT(_T) _T -#endif - -__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_buffer(std::shared_ptr< arrow::Buffer> const &); -__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_resizable_buffer(std::shared_ptr< arrow::ResizableBuffer> const &); -__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_data_type(std::shared_ptr< arrow::DataType> const &); -__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_field(std::shared_ptr< arrow::Field> const &); -__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_schema(std::shared_ptr< arrow::Schema> const &); -__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_scalar(std::shared_ptr< arrow::Scalar> const &); -__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_array(std::shared_ptr< arrow::Array> const &); -__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_chunked_array(std::shared_ptr< arrow::ChunkedArray> const &); -__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_sparse_coo_tensor(std::shared_ptr< arrow::SparseCOOTensor> const &); -__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_sparse_csc_matrix(std::shared_ptr< arrow::SparseCSCMatrix> const &); -__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_sparse_csf_tensor(std::shared_ptr< arrow::SparseCSFTensor> const &); -__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_sparse_csr_matrix(std::shared_ptr< arrow::SparseCSRMatrix> const &); -__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_tensor(std::shared_ptr< arrow::Tensor> const &); -__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_batch(std::shared_ptr< arrow::RecordBatch> const &); -__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_table(std::shared_ptr< arrow::Table> const &); -__PYX_EXTERN_C std::shared_ptr< arrow::Buffer> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_buffer(PyObject *); -__PYX_EXTERN_C std::shared_ptr< arrow::DataType> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_data_type(PyObject *); -__PYX_EXTERN_C std::shared_ptr< arrow::Field> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_field(PyObject *); 
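The generated lib.h/lib_api.h pair that begins above is itself a PyCapsule mechanism: Cython publishes every pyarrow_wrap_*/pyarrow_unwrap_* function in pyarrow.lib.__pyx_capi__ as a capsule keyed by the exact C++ signature string, and the __Pyx_ImportFunction machinery further down unpacks them. A sketch of the same lookup done by hand, assuming pyarrow is importable at runtime; load_unwrap_array is an illustrative name, and the signature string must match the generated one byte for byte:

// Sketch: manually importing pyarrow_unwrap_array from
// pyarrow.lib.__pyx_capi__, the capsule-based export that lib_api.h automates.
#include <Python.h>

#include <memory>

#include "arrow/array.h"

using UnwrapArrayFn = std::shared_ptr<arrow::Array> (*)(PyObject*);

UnwrapArrayFn load_unwrap_array() {
  PyObject* module = PyImport_ImportModule("pyarrow.lib");
  if (module == nullptr) return nullptr;
  PyObject* capi = PyObject_GetAttrString(module, "__pyx_capi__");
  Py_DECREF(module);
  if (capi == nullptr) return nullptr;
  // Borrowed reference; do not DECREF the capsule itself.
  PyObject* capsule = PyDict_GetItemString(capi, "pyarrow_unwrap_array");
  UnwrapArrayFn fn = nullptr;
  if (capsule != nullptr) {
    fn = reinterpret_cast<UnwrapArrayFn>(PyCapsule_GetPointer(
        capsule, "std::shared_ptr< arrow::Array> (PyObject *)"));
  }
  Py_DECREF(capi);
  return fn;
}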
-__PYX_EXTERN_C std::shared_ptr< arrow::Schema> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_schema(PyObject *); -__PYX_EXTERN_C std::shared_ptr< arrow::Scalar> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_scalar(PyObject *); -__PYX_EXTERN_C std::shared_ptr< arrow::Array> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_array(PyObject *); -__PYX_EXTERN_C std::shared_ptr< arrow::ChunkedArray> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_chunked_array(PyObject *); -__PYX_EXTERN_C std::shared_ptr< arrow::SparseCOOTensor> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_sparse_coo_tensor(PyObject *); -__PYX_EXTERN_C std::shared_ptr< arrow::SparseCSCMatrix> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_sparse_csc_matrix(PyObject *); -__PYX_EXTERN_C std::shared_ptr< arrow::SparseCSFTensor> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_sparse_csf_tensor(PyObject *); -__PYX_EXTERN_C std::shared_ptr< arrow::SparseCSRMatrix> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_sparse_csr_matrix(PyObject *); -__PYX_EXTERN_C std::shared_ptr< arrow::Tensor> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_tensor(PyObject *); -__PYX_EXTERN_C std::shared_ptr< arrow::RecordBatch> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_batch(PyObject *); -__PYX_EXTERN_C std::shared_ptr< arrow::Table> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_table(PyObject *); - -#endif /* !__PYX_HAVE_API__pyarrow__lib */ - -/* WARNING: the interface of the module init function changed in CPython 3.5. */ -/* It now returns a PyModuleDef instance instead of a PyModule instance. */ - -#if PY_MAJOR_VERSION < 3 -PyMODINIT_FUNC initlib(void); -#else -PyMODINIT_FUNC PyInit_lib(void); -#endif - -#endif /* !__PYX_HAVE__pyarrow__lib */ diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/lib_api.h b/src/vendored/apache-arrow-12.0.1/arrow/python/lib_api.h deleted file mode 100644 index 2f5cfa8..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/lib_api.h +++ /dev/null @@ -1,222 +0,0 @@ -/* Generated by Cython 0.29.35 */ - -#ifndef __PYX_HAVE_API__pyarrow__lib -#define __PYX_HAVE_API__pyarrow__lib -#ifdef __MINGW64__ -#define MS_WIN64 -#endif -#include "Python.h" -#include "lib.h" - -static PyObject *(*__pyx_api_f_7pyarrow_3lib_box_memory_pool)( arrow::MemoryPool *) = 0; -#define box_memory_pool __pyx_api_f_7pyarrow_3lib_box_memory_pool -static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_buffer)(std::shared_ptr< arrow::Buffer> const &) = 0; -#define pyarrow_wrap_buffer __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_buffer -static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_resizable_buffer)(std::shared_ptr< arrow::ResizableBuffer> const &) = 0; -#define pyarrow_wrap_resizable_buffer __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_resizable_buffer -static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_data_type)(std::shared_ptr< arrow::DataType> const &) = 0; -#define pyarrow_wrap_data_type __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_data_type -static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_field)(std::shared_ptr< arrow::Field> const &) = 0; -#define pyarrow_wrap_field __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_field -static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_schema)(std::shared_ptr< arrow::Schema> const &) = 0; -#define pyarrow_wrap_schema __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_schema -static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_scalar)(std::shared_ptr< arrow::Scalar> const &) = 0; -#define pyarrow_wrap_scalar __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_scalar -static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_array)(std::shared_ptr< arrow::Array> const &) = 0; -#define pyarrow_wrap_array 
__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_array -static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_chunked_array)(std::shared_ptr< arrow::ChunkedArray> const &) = 0; -#define pyarrow_wrap_chunked_array __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_chunked_array -static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_coo_tensor)(std::shared_ptr< arrow::SparseCOOTensor> const &) = 0; -#define pyarrow_wrap_sparse_coo_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_coo_tensor -static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_csc_matrix)(std::shared_ptr< arrow::SparseCSCMatrix> const &) = 0; -#define pyarrow_wrap_sparse_csc_matrix __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_csc_matrix -static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_csf_tensor)(std::shared_ptr< arrow::SparseCSFTensor> const &) = 0; -#define pyarrow_wrap_sparse_csf_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_csf_tensor -static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_csr_matrix)(std::shared_ptr< arrow::SparseCSRMatrix> const &) = 0; -#define pyarrow_wrap_sparse_csr_matrix __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_csr_matrix -static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_tensor)(std::shared_ptr< arrow::Tensor> const &) = 0; -#define pyarrow_wrap_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_tensor -static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_batch)(std::shared_ptr< arrow::RecordBatch> const &) = 0; -#define pyarrow_wrap_batch __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_batch -static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_table)(std::shared_ptr< arrow::Table> const &) = 0; -#define pyarrow_wrap_table __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_table -static std::shared_ptr< arrow::Buffer> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_buffer)(PyObject *) = 0; -#define pyarrow_unwrap_buffer __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_buffer -static std::shared_ptr< arrow::DataType> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_data_type)(PyObject *) = 0; -#define pyarrow_unwrap_data_type __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_data_type -static std::shared_ptr< arrow::Field> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_field)(PyObject *) = 0; -#define pyarrow_unwrap_field __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_field -static std::shared_ptr< arrow::Schema> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_schema)(PyObject *) = 0; -#define pyarrow_unwrap_schema __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_schema -static std::shared_ptr< arrow::Scalar> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_scalar)(PyObject *) = 0; -#define pyarrow_unwrap_scalar __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_scalar -static std::shared_ptr< arrow::Array> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_array)(PyObject *) = 0; -#define pyarrow_unwrap_array __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_array -static std::shared_ptr< arrow::ChunkedArray> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_chunked_array)(PyObject *) = 0; -#define pyarrow_unwrap_chunked_array __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_chunked_array -static std::shared_ptr< arrow::SparseCOOTensor> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_coo_tensor)(PyObject *) = 0; -#define pyarrow_unwrap_sparse_coo_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_coo_tensor -static std::shared_ptr< arrow::SparseCSCMatrix> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_csc_matrix)(PyObject *) = 0; -#define pyarrow_unwrap_sparse_csc_matrix __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_csc_matrix -static 
std::shared_ptr< arrow::SparseCSFTensor> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_csf_tensor)(PyObject *) = 0; -#define pyarrow_unwrap_sparse_csf_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_csf_tensor -static std::shared_ptr< arrow::SparseCSRMatrix> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_csr_matrix)(PyObject *) = 0; -#define pyarrow_unwrap_sparse_csr_matrix __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_csr_matrix -static std::shared_ptr< arrow::Tensor> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_tensor)(PyObject *) = 0; -#define pyarrow_unwrap_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_tensor -static std::shared_ptr< arrow::RecordBatch> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_batch)(PyObject *) = 0; -#define pyarrow_unwrap_batch __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_batch -static std::shared_ptr< arrow::Table> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_table)(PyObject *) = 0; -#define pyarrow_unwrap_table __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_table -static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_internal_check_status)(arrow::Status const &) = 0; -#define pyarrow_internal_check_status __pyx_api_f_7pyarrow_3lib_pyarrow_internal_check_status -static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_buffer)(PyObject *) = 0; -#define pyarrow_is_buffer __pyx_api_f_7pyarrow_3lib_pyarrow_is_buffer -static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_data_type)(PyObject *) = 0; -#define pyarrow_is_data_type __pyx_api_f_7pyarrow_3lib_pyarrow_is_data_type -static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_metadata)(PyObject *) = 0; -#define pyarrow_is_metadata __pyx_api_f_7pyarrow_3lib_pyarrow_is_metadata -static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_field)(PyObject *) = 0; -#define pyarrow_is_field __pyx_api_f_7pyarrow_3lib_pyarrow_is_field -static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_schema)(PyObject *) = 0; -#define pyarrow_is_schema __pyx_api_f_7pyarrow_3lib_pyarrow_is_schema -static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_array)(PyObject *) = 0; -#define pyarrow_is_array __pyx_api_f_7pyarrow_3lib_pyarrow_is_array -static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_chunked_array)(PyObject *) = 0; -#define pyarrow_is_chunked_array __pyx_api_f_7pyarrow_3lib_pyarrow_is_chunked_array -static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_scalar)(PyObject *) = 0; -#define pyarrow_is_scalar __pyx_api_f_7pyarrow_3lib_pyarrow_is_scalar -static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_tensor)(PyObject *) = 0; -#define pyarrow_is_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_is_tensor -static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_coo_tensor)(PyObject *) = 0; -#define pyarrow_is_sparse_coo_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_coo_tensor -static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_csr_matrix)(PyObject *) = 0; -#define pyarrow_is_sparse_csr_matrix __pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_csr_matrix -static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_csc_matrix)(PyObject *) = 0; -#define pyarrow_is_sparse_csc_matrix __pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_csc_matrix -static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_csf_tensor)(PyObject *) = 0; -#define pyarrow_is_sparse_csf_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_csf_tensor -static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_table)(PyObject *) = 0; -#define pyarrow_is_table __pyx_api_f_7pyarrow_3lib_pyarrow_is_table -static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_batch)(PyObject *) = 0; -#define pyarrow_is_batch 
__pyx_api_f_7pyarrow_3lib_pyarrow_is_batch -#if !defined(__Pyx_PyIdentifier_FromString) -#if PY_MAJOR_VERSION < 3 - #define __Pyx_PyIdentifier_FromString(s) PyString_FromString(s) -#else - #define __Pyx_PyIdentifier_FromString(s) PyUnicode_FromString(s) -#endif -#endif - -#ifndef __PYX_HAVE_RT_ImportFunction_0_29_35 -#define __PYX_HAVE_RT_ImportFunction_0_29_35 -static int __Pyx_ImportFunction_0_29_35(PyObject *module, const char *funcname, void (**f)(void), const char *sig) { - PyObject *d = 0; - PyObject *cobj = 0; - union { - void (*fp)(void); - void *p; - } tmp; - d = PyObject_GetAttrString(module, (char *)"__pyx_capi__"); - if (!d) - goto bad; - cobj = PyDict_GetItemString(d, funcname); - if (!cobj) { - PyErr_Format(PyExc_ImportError, - "%.200s does not export expected C function %.200s", - PyModule_GetName(module), funcname); - goto bad; - } -#if PY_VERSION_HEX >= 0x02070000 - if (!PyCapsule_IsValid(cobj, sig)) { - PyErr_Format(PyExc_TypeError, - "C function %.200s.%.200s has wrong signature (expected %.500s, got %.500s)", - PyModule_GetName(module), funcname, sig, PyCapsule_GetName(cobj)); - goto bad; - } - tmp.p = PyCapsule_GetPointer(cobj, sig); -#else - {const char *desc, *s1, *s2; - desc = (const char *)PyCObject_GetDesc(cobj); - if (!desc) - goto bad; - s1 = desc; s2 = sig; - while (*s1 != '\0' && *s1 == *s2) { s1++; s2++; } - if (*s1 != *s2) { - PyErr_Format(PyExc_TypeError, - "C function %.200s.%.200s has wrong signature (expected %.500s, got %.500s)", - PyModule_GetName(module), funcname, sig, desc); - goto bad; - } - tmp.p = PyCObject_AsVoidPtr(cobj);} -#endif - *f = tmp.fp; - if (!(*f)) - goto bad; - Py_DECREF(d); - return 0; -bad: - Py_XDECREF(d); - return -1; -} -#endif - - -static int import_pyarrow__lib(void) { - PyObject *module = 0; - module = PyImport_ImportModule("pyarrow.lib"); - if (!module) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "box_memory_pool", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_box_memory_pool, "PyObject *( arrow::MemoryPool *)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_wrap_buffer", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_buffer, "PyObject *(std::shared_ptr< arrow::Buffer> const &)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_wrap_resizable_buffer", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_resizable_buffer, "PyObject *(std::shared_ptr< arrow::ResizableBuffer> const &)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_wrap_data_type", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_data_type, "PyObject *(std::shared_ptr< arrow::DataType> const &)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_wrap_field", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_field, "PyObject *(std::shared_ptr< arrow::Field> const &)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_wrap_schema", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_schema, "PyObject *(std::shared_ptr< arrow::Schema> const &)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_wrap_scalar", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_scalar, "PyObject *(std::shared_ptr< arrow::Scalar> const &)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_wrap_array", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_array, "PyObject *(std::shared_ptr< arrow::Array> const &)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_wrap_chunked_array", (void 
(**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_chunked_array, "PyObject *(std::shared_ptr< arrow::ChunkedArray> const &)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_wrap_sparse_coo_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_coo_tensor, "PyObject *(std::shared_ptr< arrow::SparseCOOTensor> const &)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_wrap_sparse_csc_matrix", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_csc_matrix, "PyObject *(std::shared_ptr< arrow::SparseCSCMatrix> const &)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_wrap_sparse_csf_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_csf_tensor, "PyObject *(std::shared_ptr< arrow::SparseCSFTensor> const &)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_wrap_sparse_csr_matrix", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_csr_matrix, "PyObject *(std::shared_ptr< arrow::SparseCSRMatrix> const &)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_wrap_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_tensor, "PyObject *(std::shared_ptr< arrow::Tensor> const &)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_wrap_batch", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_batch, "PyObject *(std::shared_ptr< arrow::RecordBatch> const &)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_wrap_table", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_table, "PyObject *(std::shared_ptr< arrow::Table> const &)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_unwrap_buffer", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_buffer, "std::shared_ptr< arrow::Buffer> (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_unwrap_data_type", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_data_type, "std::shared_ptr< arrow::DataType> (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_unwrap_field", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_field, "std::shared_ptr< arrow::Field> (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_unwrap_schema", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_schema, "std::shared_ptr< arrow::Schema> (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_unwrap_scalar", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_scalar, "std::shared_ptr< arrow::Scalar> (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_unwrap_array", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_array, "std::shared_ptr< arrow::Array> (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_unwrap_chunked_array", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_chunked_array, "std::shared_ptr< arrow::ChunkedArray> (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_unwrap_sparse_coo_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_coo_tensor, "std::shared_ptr< arrow::SparseCOOTensor> (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_unwrap_sparse_csc_matrix", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_csc_matrix, "std::shared_ptr< arrow::SparseCSCMatrix> (PyObject *)") < 0) goto bad; - if 
(__Pyx_ImportFunction_0_29_35(module, "pyarrow_unwrap_sparse_csf_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_csf_tensor, "std::shared_ptr< arrow::SparseCSFTensor> (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_unwrap_sparse_csr_matrix", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_csr_matrix, "std::shared_ptr< arrow::SparseCSRMatrix> (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_unwrap_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_tensor, "std::shared_ptr< arrow::Tensor> (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_unwrap_batch", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_batch, "std::shared_ptr< arrow::RecordBatch> (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_unwrap_table", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_table, "std::shared_ptr< arrow::Table> (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_internal_check_status", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_internal_check_status, "int (arrow::Status const &)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_is_buffer", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_buffer, "int (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_is_data_type", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_data_type, "int (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_is_metadata", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_metadata, "int (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_is_field", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_field, "int (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_is_schema", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_schema, "int (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_is_array", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_array, "int (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_is_chunked_array", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_chunked_array, "int (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_is_scalar", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_scalar, "int (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_is_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_tensor, "int (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_is_sparse_coo_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_coo_tensor, "int (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_is_sparse_csr_matrix", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_csr_matrix, "int (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_is_sparse_csc_matrix", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_csc_matrix, "int (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_is_sparse_csf_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_csf_tensor, "int (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_is_table", (void 
(**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_table, "int (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction_0_29_35(module, "pyarrow_is_batch", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_batch, "int (PyObject *)") < 0) goto bad; - Py_DECREF(module); module = 0; - return 0; - bad: - Py_XDECREF(module); - return -1; -} - -#endif /* !__PYX_HAVE_API__pyarrow__lib */ diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/numpy_convert.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/numpy_convert.cc deleted file mode 100644 index 4970680..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/numpy_convert.cc +++ /dev/null @@ -1,562 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "arrow/python/numpy_interop.h" - -#include "arrow/python/numpy_convert.h" - -#include -#include -#include -#include - -#include "arrow/buffer.h" -#include "arrow/sparse_tensor.h" -#include "arrow/tensor.h" -#include "arrow/type.h" -#include "arrow/util/logging.h" - -#include "arrow/python/common.h" -#include "arrow/python/pyarrow.h" -#include "arrow/python/type_traits.h" - -namespace arrow { -namespace py { - -NumPyBuffer::NumPyBuffer(PyObject* ao) : Buffer(nullptr, 0) { - PyAcquireGIL lock; - arr_ = ao; - Py_INCREF(ao); - - if (PyArray_Check(ao)) { - PyArrayObject* ndarray = reinterpret_cast(ao); - auto ptr = reinterpret_cast(PyArray_DATA(ndarray)); - data_ = const_cast(ptr); - size_ = PyArray_SIZE(ndarray) * PyArray_DESCR(ndarray)->elsize; - capacity_ = size_; - is_mutable_ = !!(PyArray_FLAGS(ndarray) & NPY_ARRAY_WRITEABLE); - } -} - -NumPyBuffer::~NumPyBuffer() { - PyAcquireGIL lock; - Py_XDECREF(arr_); -} - -#define TO_ARROW_TYPE_CASE(NPY_NAME, FACTORY) \ - case NPY_##NPY_NAME: \ - *out = FACTORY(); \ - break; - -namespace { - -Status GetTensorType(PyObject* dtype, std::shared_ptr* out) { - if (!PyObject_TypeCheck(dtype, &PyArrayDescr_Type)) { - return Status::TypeError("Did not pass numpy.dtype object"); - } - PyArray_Descr* descr = reinterpret_cast(dtype); - int type_num = fix_numpy_type_num(descr->type_num); - - switch (type_num) { - TO_ARROW_TYPE_CASE(BOOL, uint8); - TO_ARROW_TYPE_CASE(INT8, int8); - TO_ARROW_TYPE_CASE(INT16, int16); - TO_ARROW_TYPE_CASE(INT32, int32); - TO_ARROW_TYPE_CASE(INT64, int64); - TO_ARROW_TYPE_CASE(UINT8, uint8); - TO_ARROW_TYPE_CASE(UINT16, uint16); - TO_ARROW_TYPE_CASE(UINT32, uint32); - TO_ARROW_TYPE_CASE(UINT64, uint64); - TO_ARROW_TYPE_CASE(FLOAT16, float16); - TO_ARROW_TYPE_CASE(FLOAT32, float32); - TO_ARROW_TYPE_CASE(FLOAT64, float64); - default: { - return Status::NotImplemented("Unsupported numpy type ", descr->type_num); - } - } - return Status::OK(); -} - -Status GetNumPyType(const DataType& type, int* type_num) { -#define NUMPY_TYPE_CASE(ARROW_NAME, NPY_NAME) \ - case 
Type::ARROW_NAME: \ - *type_num = NPY_##NPY_NAME; \ - break; - - switch (type.id()) { - NUMPY_TYPE_CASE(UINT8, UINT8); - NUMPY_TYPE_CASE(INT8, INT8); - NUMPY_TYPE_CASE(UINT16, UINT16); - NUMPY_TYPE_CASE(INT16, INT16); - NUMPY_TYPE_CASE(UINT32, UINT32); - NUMPY_TYPE_CASE(INT32, INT32); - NUMPY_TYPE_CASE(UINT64, UINT64); - NUMPY_TYPE_CASE(INT64, INT64); - NUMPY_TYPE_CASE(HALF_FLOAT, FLOAT16); - NUMPY_TYPE_CASE(FLOAT, FLOAT32); - NUMPY_TYPE_CASE(DOUBLE, FLOAT64); - default: { - return Status::NotImplemented("Unsupported tensor type: ", type.ToString()); - } - } -#undef NUMPY_TYPE_CASE - - return Status::OK(); -} - -} // namespace - -Status NumPyDtypeToArrow(PyObject* dtype, std::shared_ptr* out) { - if (!PyObject_TypeCheck(dtype, &PyArrayDescr_Type)) { - return Status::TypeError("Did not pass numpy.dtype object"); - } - PyArray_Descr* descr = reinterpret_cast(dtype); - return NumPyDtypeToArrow(descr, out); -} - -Status NumPyDtypeToArrow(PyArray_Descr* descr, std::shared_ptr* out) { - int type_num = fix_numpy_type_num(descr->type_num); - - switch (type_num) { - TO_ARROW_TYPE_CASE(BOOL, boolean); - TO_ARROW_TYPE_CASE(INT8, int8); - TO_ARROW_TYPE_CASE(INT16, int16); - TO_ARROW_TYPE_CASE(INT32, int32); - TO_ARROW_TYPE_CASE(INT64, int64); - TO_ARROW_TYPE_CASE(UINT8, uint8); - TO_ARROW_TYPE_CASE(UINT16, uint16); - TO_ARROW_TYPE_CASE(UINT32, uint32); - TO_ARROW_TYPE_CASE(UINT64, uint64); - TO_ARROW_TYPE_CASE(FLOAT16, float16); - TO_ARROW_TYPE_CASE(FLOAT32, float32); - TO_ARROW_TYPE_CASE(FLOAT64, float64); - TO_ARROW_TYPE_CASE(STRING, binary); - TO_ARROW_TYPE_CASE(UNICODE, utf8); - case NPY_DATETIME: { - auto date_dtype = - reinterpret_cast(descr->c_metadata); - switch (date_dtype->meta.base) { - case NPY_FR_s: - *out = timestamp(TimeUnit::SECOND); - break; - case NPY_FR_ms: - *out = timestamp(TimeUnit::MILLI); - break; - case NPY_FR_us: - *out = timestamp(TimeUnit::MICRO); - break; - case NPY_FR_ns: - *out = timestamp(TimeUnit::NANO); - break; - case NPY_FR_D: - *out = date32(); - break; - case NPY_FR_GENERIC: - return Status::NotImplemented("Unbound or generic datetime64 time unit"); - default: - return Status::NotImplemented("Unsupported datetime64 time unit"); - } - } break; - case NPY_TIMEDELTA: { - auto timedelta_dtype = - reinterpret_cast(descr->c_metadata); - switch (timedelta_dtype->meta.base) { - case NPY_FR_s: - *out = duration(TimeUnit::SECOND); - break; - case NPY_FR_ms: - *out = duration(TimeUnit::MILLI); - break; - case NPY_FR_us: - *out = duration(TimeUnit::MICRO); - break; - case NPY_FR_ns: - *out = duration(TimeUnit::NANO); - break; - case NPY_FR_GENERIC: - return Status::NotImplemented("Unbound or generic timedelta64 time unit"); - default: - return Status::NotImplemented("Unsupported timedelta64 time unit"); - } - } break; - default: { - return Status::NotImplemented("Unsupported numpy type ", descr->type_num); - } - } - - return Status::OK(); -} - -#undef TO_ARROW_TYPE_CASE - -Status NdarrayToTensor(MemoryPool* pool, PyObject* ao, - const std::vector& dim_names, - std::shared_ptr* out) { - if (!PyArray_Check(ao)) { - return Status::TypeError("Did not pass ndarray object"); - } - - PyArrayObject* ndarray = reinterpret_cast(ao); - - // TODO(wesm): What do we want to do with non-contiguous memory and negative strides? 
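In use, the dtype mapping above is a single call. A sketch assuming the vendored numpy_convert.h; ArrowTypeOf is an illustrative wrapper, and per the switch above numpy.dtype("datetime64[ms]") maps to arrow::timestamp(TimeUnit::MILLI):

// Sketch: resolving a NumPy dtype object to an Arrow DataType.
#include <memory>

#include "arrow/python/numpy_convert.h"
#include "arrow/result.h"

arrow::Result<std::shared_ptr<arrow::DataType>> ArrowTypeOf(PyObject* np_dtype) {
  std::shared_ptr<arrow::DataType> type;
  ARROW_RETURN_NOT_OK(arrow::py::NumPyDtypeToArrow(np_dtype, &type));
  return type;
}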
- - int ndim = PyArray_NDIM(ndarray); - - std::shared_ptr data = std::make_shared(ao); - std::vector shape(ndim); - std::vector strides(ndim); - - npy_intp* array_strides = PyArray_STRIDES(ndarray); - npy_intp* array_shape = PyArray_SHAPE(ndarray); - for (int i = 0; i < ndim; ++i) { - if (array_strides[i] < 0) { - return Status::Invalid("Negative ndarray strides not supported"); - } - shape[i] = array_shape[i]; - strides[i] = array_strides[i]; - } - - std::shared_ptr type; - RETURN_NOT_OK( - GetTensorType(reinterpret_cast(PyArray_DESCR(ndarray)), &type)); - *out = std::make_shared(type, data, shape, strides, dim_names); - return Status::OK(); -} - -Status TensorToNdarray(const std::shared_ptr& tensor, PyObject* base, - PyObject** out) { - int type_num = 0; - RETURN_NOT_OK(GetNumPyType(*tensor->type(), &type_num)); - PyArray_Descr* dtype = PyArray_DescrNewFromType(type_num); - RETURN_IF_PYERROR(); - - const int ndim = tensor->ndim(); - std::vector npy_shape(ndim); - std::vector npy_strides(ndim); - - for (int i = 0; i < ndim; ++i) { - npy_shape[i] = tensor->shape()[i]; - npy_strides[i] = tensor->strides()[i]; - } - - const void* immutable_data = nullptr; - if (tensor->data()) { - immutable_data = tensor->data()->data(); - } - - // Remove const =( - void* mutable_data = const_cast(immutable_data); - - int array_flags = 0; - if (tensor->is_row_major()) { - array_flags |= NPY_ARRAY_C_CONTIGUOUS; - } - if (tensor->is_column_major()) { - array_flags |= NPY_ARRAY_F_CONTIGUOUS; - } - if (tensor->is_mutable()) { - array_flags |= NPY_ARRAY_WRITEABLE; - } - - PyObject* result = - PyArray_NewFromDescr(&PyArray_Type, dtype, ndim, npy_shape.data(), - npy_strides.data(), mutable_data, array_flags, nullptr); - RETURN_IF_PYERROR(); - - if (base == Py_None || base == nullptr) { - base = py::wrap_tensor(tensor); - } else { - Py_XINCREF(base); - } - PyArray_SetBaseObject(reinterpret_cast(result), base); - *out = result; - return Status::OK(); -} - -// Wrap the dense data of a sparse tensor in a ndarray -static Status SparseTensorDataToNdarray(const SparseTensor& sparse_tensor, - std::vector data_shape, PyObject* base, - PyObject** out_data) { - int type_num_data = 0; - RETURN_NOT_OK(GetNumPyType(*sparse_tensor.type(), &type_num_data)); - PyArray_Descr* dtype_data = PyArray_DescrNewFromType(type_num_data); - RETURN_IF_PYERROR(); - - const void* immutable_data = sparse_tensor.data()->data(); - // Remove const =( - void* mutable_data = const_cast(immutable_data); - int array_flags = NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_F_CONTIGUOUS; - if (sparse_tensor.is_mutable()) { - array_flags |= NPY_ARRAY_WRITEABLE; - } - - *out_data = PyArray_NewFromDescr(&PyArray_Type, dtype_data, - static_cast(data_shape.size()), data_shape.data(), - nullptr, mutable_data, array_flags, nullptr); - RETURN_IF_PYERROR(); - Py_XINCREF(base); - PyArray_SetBaseObject(reinterpret_cast(*out_data), base); - return Status::OK(); -} - -Status SparseCOOTensorToNdarray(const std::shared_ptr& sparse_tensor, - PyObject* base, PyObject** out_data, - PyObject** out_coords) { - const auto& sparse_index = arrow::internal::checked_cast( - *sparse_tensor->sparse_index()); - - // Wrap tensor data - OwnedRef result_data; - RETURN_NOT_OK(SparseTensorDataToNdarray( - *sparse_tensor, {static_cast(sparse_tensor->non_zero_length()), 1}, base, - result_data.ref())); - - // Wrap indices - PyObject* result_coords; - RETURN_NOT_OK(TensorToNdarray(sparse_index.indices(), base, &result_coords)); - - *out_data = result_data.detach(); - *out_coords = result_coords; - return 
Status::OK(); -} - -Status SparseCSXMatrixToNdarray(const std::shared_ptr& sparse_tensor, - PyObject* base, PyObject** out_data, - PyObject** out_indptr, PyObject** out_indices) { - // Wrap indices - OwnedRef result_indptr; - OwnedRef result_indices; - - switch (sparse_tensor->format_id()) { - case SparseTensorFormat::CSR: { - const auto& sparse_index = arrow::internal::checked_cast( - *sparse_tensor->sparse_index()); - RETURN_NOT_OK(TensorToNdarray(sparse_index.indptr(), base, result_indptr.ref())); - RETURN_NOT_OK(TensorToNdarray(sparse_index.indices(), base, result_indices.ref())); - break; - } - case SparseTensorFormat::CSC: { - const auto& sparse_index = arrow::internal::checked_cast( - *sparse_tensor->sparse_index()); - RETURN_NOT_OK(TensorToNdarray(sparse_index.indptr(), base, result_indptr.ref())); - RETURN_NOT_OK(TensorToNdarray(sparse_index.indices(), base, result_indices.ref())); - break; - } - default: - return Status::NotImplemented("Invalid SparseTensor type."); - } - - // Wrap tensor data - OwnedRef result_data; - RETURN_NOT_OK(SparseTensorDataToNdarray( - *sparse_tensor, {static_cast(sparse_tensor->non_zero_length()), 1}, base, - result_data.ref())); - - *out_data = result_data.detach(); - *out_indptr = result_indptr.detach(); - *out_indices = result_indices.detach(); - return Status::OK(); -} - -Status SparseCSRMatrixToNdarray(const std::shared_ptr& sparse_tensor, - PyObject* base, PyObject** out_data, - PyObject** out_indptr, PyObject** out_indices) { - return SparseCSXMatrixToNdarray(sparse_tensor, base, out_data, out_indptr, out_indices); -} - -Status SparseCSCMatrixToNdarray(const std::shared_ptr& sparse_tensor, - PyObject* base, PyObject** out_data, - PyObject** out_indptr, PyObject** out_indices) { - return SparseCSXMatrixToNdarray(sparse_tensor, base, out_data, out_indptr, out_indices); -} - -Status SparseCSFTensorToNdarray(const std::shared_ptr& sparse_tensor, - PyObject* base, PyObject** out_data, - PyObject** out_indptr, PyObject** out_indices) { - const auto& sparse_index = arrow::internal::checked_cast( - *sparse_tensor->sparse_index()); - - // Wrap tensor data - OwnedRef result_data; - RETURN_NOT_OK(SparseTensorDataToNdarray( - *sparse_tensor, {static_cast(sparse_tensor->non_zero_length()), 1}, base, - result_data.ref())); - - // Wrap indices - int ndim = static_cast(sparse_index.indices().size()); - OwnedRef indptr(PyList_New(ndim - 1)); - OwnedRef indices(PyList_New(ndim)); - RETURN_IF_PYERROR(); - - for (int i = 0; i < ndim - 1; ++i) { - PyObject* item; - RETURN_NOT_OK(TensorToNdarray(sparse_index.indptr()[i], base, &item)); - if (PyList_SetItem(indptr.obj(), i, item) < 0) { - Py_XDECREF(item); - RETURN_IF_PYERROR(); - } - } - for (int i = 0; i < ndim; ++i) { - PyObject* item; - RETURN_NOT_OK(TensorToNdarray(sparse_index.indices()[i], base, &item)); - if (PyList_SetItem(indices.obj(), i, item) < 0) { - Py_XDECREF(item); - RETURN_IF_PYERROR(); - } - } - - *out_indptr = indptr.detach(); - *out_indices = indices.detach(); - *out_data = result_data.detach(); - return Status::OK(); -} - -Status NdarraysToSparseCOOTensor(MemoryPool* pool, PyObject* data_ao, PyObject* coords_ao, - const std::vector& shape, - const std::vector& dim_names, - std::shared_ptr* out) { - if (!PyArray_Check(data_ao) || !PyArray_Check(coords_ao)) { - return Status::TypeError("Did not pass ndarray object"); - } - - PyArrayObject* ndarray_data = reinterpret_cast(data_ao); - std::shared_ptr data = std::make_shared(data_ao); - std::shared_ptr type_data; - 
RETURN_NOT_OK(GetTensorType(reinterpret_cast(PyArray_DESCR(ndarray_data)), - &type_data)); - - std::shared_ptr coords; - RETURN_NOT_OK(NdarrayToTensor(pool, coords_ao, {}, &coords)); - ARROW_CHECK_EQ(coords->type_id(), Type::INT64); // Should be ensured by caller - - ARROW_ASSIGN_OR_RAISE(std::shared_ptr sparse_index, - SparseCOOIndex::Make(coords)); - *out = std::make_shared>(sparse_index, type_data, data, - shape, dim_names); - return Status::OK(); -} - -template -Status NdarraysToSparseCSXMatrix(MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao, - PyObject* indices_ao, const std::vector& shape, - const std::vector& dim_names, - std::shared_ptr>* out) { - if (!PyArray_Check(data_ao) || !PyArray_Check(indptr_ao) || - !PyArray_Check(indices_ao)) { - return Status::TypeError("Did not pass ndarray object"); - } - - PyArrayObject* ndarray_data = reinterpret_cast(data_ao); - std::shared_ptr data = std::make_shared(data_ao); - std::shared_ptr type_data; - RETURN_NOT_OK(GetTensorType(reinterpret_cast(PyArray_DESCR(ndarray_data)), - &type_data)); - - std::shared_ptr indptr, indices; - RETURN_NOT_OK(NdarrayToTensor(pool, indptr_ao, {}, &indptr)); - RETURN_NOT_OK(NdarrayToTensor(pool, indices_ao, {}, &indices)); - ARROW_CHECK_EQ(indptr->type_id(), Type::INT64); // Should be ensured by caller - ARROW_CHECK_EQ(indices->type_id(), Type::INT64); // Should be ensured by caller - - auto sparse_index = std::make_shared( - std::static_pointer_cast>(indptr), - std::static_pointer_cast>(indices)); - *out = std::make_shared>(sparse_index, type_data, data, - shape, dim_names); - return Status::OK(); -} - -Status NdarraysToSparseCSFTensor(MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao, - PyObject* indices_ao, const std::vector& shape, - const std::vector& axis_order, - const std::vector& dim_names, - std::shared_ptr* out) { - if (!PyArray_Check(data_ao)) { - return Status::TypeError("Did not pass ndarray object for data"); - } - const int ndim = static_cast(shape.size()); - PyArrayObject* ndarray_data = reinterpret_cast(data_ao); - std::shared_ptr data = std::make_shared(data_ao); - std::shared_ptr type_data; - RETURN_NOT_OK(GetTensorType(reinterpret_cast(PyArray_DESCR(ndarray_data)), - &type_data)); - - std::vector> indptr(ndim - 1); - std::vector> indices(ndim); - - for (int i = 0; i < ndim - 1; ++i) { - PyObject* item = PySequence_Fast_GET_ITEM(indptr_ao, i); - if (!PyArray_Check(item)) { - return Status::TypeError("Did not pass ndarray object for indptr"); - } - RETURN_NOT_OK(NdarrayToTensor(pool, item, {}, &indptr[i])); - ARROW_CHECK_EQ(indptr[i]->type_id(), Type::INT64); // Should be ensured by caller - } - - for (int i = 0; i < ndim; ++i) { - PyObject* item = PySequence_Fast_GET_ITEM(indices_ao, i); - if (!PyArray_Check(item)) { - return Status::TypeError("Did not pass ndarray object for indices"); - } - RETURN_NOT_OK(NdarrayToTensor(pool, item, {}, &indices[i])); - ARROW_CHECK_EQ(indices[i]->type_id(), Type::INT64); // Should be ensured by caller - } - - auto sparse_index = std::make_shared(indptr, indices, axis_order); - *out = std::make_shared>(sparse_index, type_data, data, - shape, dim_names); - return Status::OK(); -} - -Status NdarraysToSparseCSRMatrix(MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao, - PyObject* indices_ao, const std::vector& shape, - const std::vector& dim_names, - std::shared_ptr* out) { - return NdarraysToSparseCSXMatrix(pool, data_ao, indptr_ao, indices_ao, - shape, dim_names, out); -} - -Status NdarraysToSparseCSCMatrix(MemoryPool* pool, PyObject* 
data_ao, PyObject* indptr_ao, - PyObject* indices_ao, const std::vector& shape, - const std::vector& dim_names, - std::shared_ptr* out) { - return NdarraysToSparseCSXMatrix(pool, data_ao, indptr_ao, indices_ao, - shape, dim_names, out); -} - -Status TensorToSparseCOOTensor(const std::shared_ptr& tensor, - std::shared_ptr* out) { - return SparseCOOTensor::Make(*tensor).Value(out); -} - -Status TensorToSparseCSRMatrix(const std::shared_ptr& tensor, - std::shared_ptr* out) { - return SparseCSRMatrix::Make(*tensor).Value(out); -} - -Status TensorToSparseCSCMatrix(const std::shared_ptr& tensor, - std::shared_ptr* out) { - return SparseCSCMatrix::Make(*tensor).Value(out); -} - -Status TensorToSparseCSFTensor(const std::shared_ptr& tensor, - std::shared_ptr* out) { - return SparseCSFTensor::Make(*tensor).Value(out); -} - -} // namespace py -} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/numpy_convert.h b/src/vendored/apache-arrow-12.0.1/arrow/python/numpy_convert.h deleted file mode 100644 index 1045107..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/numpy_convert.h +++ /dev/null @@ -1,120 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
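NumPyBuffer above and PyForeignBuffer in the deleted io.h solve the same lifetime problem: the C++ buffer, not any Python-side wrapper, must hold the strong reference to the Python object backing its memory, since the wrapper can be collected while the buffer lives on (ARROW-2270). The pattern in isolation, as a sketch; PyOwnedBuffer is an illustrative name, and the deleted classes layer GIL-holding RAII and ndarray-specific setup on top of this:

// Sketch: an arrow::Buffer that keeps its backing Python object alive.
#include <Python.h>

#include <cstdint>

#include "arrow/buffer.h"

class PyOwnedBuffer : public arrow::Buffer {
 public:
  PyOwnedBuffer(const uint8_t* data, int64_t size, PyObject* owner)
      : arrow::Buffer(data, size), owner_(owner) {
    Py_INCREF(owner_);  // buffer now owns a strong reference
  }

  ~PyOwnedBuffer() override {
    PyGILState_STATE state = PyGILState_Ensure();  // DECREF requires the GIL
    Py_XDECREF(owner_);
    PyGILState_Release(state);
  }

 private:
  PyObject* owner_;
};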
- -// Functions for converting between pandas's NumPy-based data representation -// and Arrow data structures - -#pragma once - -#include "arrow/python/platform.h" - -#include <memory> -#include <string> -#include <vector> - -#include "arrow/buffer.h" -#include "arrow/python/visibility.h" -#include "arrow/sparse_tensor.h" - -namespace arrow { - -class DataType; -class MemoryPool; -class Status; -class Tensor; - -namespace py { - -class ARROW_PYTHON_EXPORT NumPyBuffer : public Buffer { - public: - explicit NumPyBuffer(PyObject* arr); - virtual ~NumPyBuffer(); - - private: - PyObject* arr_; -}; - -ARROW_PYTHON_EXPORT -Status NumPyDtypeToArrow(PyObject* dtype, std::shared_ptr<DataType>* out); -ARROW_PYTHON_EXPORT -Status NumPyDtypeToArrow(PyArray_Descr* descr, std::shared_ptr<DataType>* out); - -ARROW_PYTHON_EXPORT Status NdarrayToTensor(MemoryPool* pool, PyObject* ao, - const std::vector<std::string>& dim_names, - std::shared_ptr<Tensor>* out); - -ARROW_PYTHON_EXPORT Status TensorToNdarray(const std::shared_ptr<Tensor>& tensor, - PyObject* base, PyObject** out); - -ARROW_PYTHON_EXPORT Status -SparseCOOTensorToNdarray(const std::shared_ptr<SparseCOOTensor>& sparse_tensor, - PyObject* base, PyObject** out_data, PyObject** out_coords); - -Status SparseCSXMatrixToNdarray(const std::shared_ptr<SparseTensor>& sparse_tensor, - PyObject* base, PyObject** out_data, - PyObject** out_indptr, PyObject** out_indices); - -ARROW_PYTHON_EXPORT Status SparseCSRMatrixToNdarray( - const std::shared_ptr<SparseCSRMatrix>& sparse_tensor, PyObject* base, - PyObject** out_data, PyObject** out_indptr, PyObject** out_indices); - -ARROW_PYTHON_EXPORT Status SparseCSCMatrixToNdarray( - const std::shared_ptr<SparseCSCMatrix>& sparse_tensor, PyObject* base, - PyObject** out_data, PyObject** out_indptr, PyObject** out_indices); - -ARROW_PYTHON_EXPORT Status SparseCSFTensorToNdarray( - const std::shared_ptr<SparseCSFTensor>& sparse_tensor, PyObject* base, - PyObject** out_data, PyObject** out_indptr, PyObject** out_indices); - -ARROW_PYTHON_EXPORT Status NdarraysToSparseCOOTensor( - MemoryPool* pool, PyObject* data_ao, PyObject* coords_ao, - const std::vector<int64_t>& shape, const std::vector<std::string>& dim_names, - std::shared_ptr<SparseCOOTensor>* out); - -ARROW_PYTHON_EXPORT Status NdarraysToSparseCSRMatrix( - MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao, PyObject* indices_ao, - const std::vector<int64_t>& shape, const std::vector<std::string>& dim_names, - std::shared_ptr<SparseCSRMatrix>* out); - -ARROW_PYTHON_EXPORT Status NdarraysToSparseCSCMatrix( - MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao, PyObject* indices_ao, - const std::vector<int64_t>& shape, const std::vector<std::string>& dim_names, - std::shared_ptr<SparseCSCMatrix>* out); - -ARROW_PYTHON_EXPORT Status NdarraysToSparseCSFTensor( - MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao, PyObject* indices_ao, - const std::vector<int64_t>& shape, const std::vector<int64_t>& axis_order, - const std::vector<std::string>& dim_names, std::shared_ptr<SparseCSFTensor>* out); - -ARROW_PYTHON_EXPORT Status -TensorToSparseCOOTensor(const std::shared_ptr<Tensor>& tensor, - std::shared_ptr<SparseCOOTensor>* csparse_tensor); - -ARROW_PYTHON_EXPORT Status -TensorToSparseCSRMatrix(const std::shared_ptr<Tensor>& tensor, - std::shared_ptr<SparseCSRMatrix>* csparse_tensor); - -ARROW_PYTHON_EXPORT Status -TensorToSparseCSCMatrix(const std::shared_ptr<Tensor>& tensor, - std::shared_ptr<SparseCSCMatrix>* csparse_tensor); - -ARROW_PYTHON_EXPORT Status -TensorToSparseCSFTensor(const std::shared_ptr<Tensor>& tensor, - std::shared_ptr<SparseCSFTensor>* csparse_tensor); - -} // namespace py -} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/numpy_internal.h b/src/vendored/apache-arrow-12.0.1/arrow/python/numpy_internal.h deleted file mode 100644 index b9b632f..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/numpy_internal.h
+++ /dev/null @@ -1,182 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Internal utilities for dealing with NumPy - -#pragma once - -#include "arrow/python/numpy_interop.h" - -#include "arrow/status.h" - -#include "arrow/python/platform.h" - -#include -#include -#include - -namespace arrow { -namespace py { - -/// Indexing convenience for interacting with strided 1-dim ndarray objects -template -class Ndarray1DIndexer { - public: - typedef int64_t size_type; - - Ndarray1DIndexer() : arr_(NULLPTR), data_(NULLPTR) {} - - explicit Ndarray1DIndexer(PyArrayObject* arr) : Ndarray1DIndexer() { - arr_ = arr; - DCHECK_EQ(1, PyArray_NDIM(arr)) << "Only works with 1-dimensional arrays"; - data_ = reinterpret_cast(PyArray_DATA(arr)); - stride_ = PyArray_STRIDES(arr)[0]; - } - - ~Ndarray1DIndexer() = default; - - int64_t size() const { return PyArray_SIZE(arr_); } - - const T* data() const { return reinterpret_cast(data_); } - - bool is_strided() const { return stride_ != sizeof(T); } - - T& operator[](size_type index) { - return *reinterpret_cast(data_ + index * stride_); - } - const T& operator[](size_type index) const { - return *reinterpret_cast(data_ + index * stride_); - } - - private: - PyArrayObject* arr_; - uint8_t* data_; - int64_t stride_; -}; - -// Handling of Numpy Types by their static numbers -// (the NPY_TYPES enum and related defines) - -static inline std::string GetNumPyTypeName(int npy_type) { -#define TYPE_CASE(TYPE, NAME) \ - case NPY_##TYPE: \ - return NAME; - - switch (npy_type) { - TYPE_CASE(BOOL, "bool") - TYPE_CASE(INT8, "int8") - TYPE_CASE(INT16, "int16") - TYPE_CASE(INT32, "int32") - TYPE_CASE(INT64, "int64") -#if !NPY_INT32_IS_INT - TYPE_CASE(INT, "intc") -#endif -#if !NPY_INT64_IS_LONG_LONG - TYPE_CASE(LONGLONG, "longlong") -#endif - TYPE_CASE(UINT8, "uint8") - TYPE_CASE(UINT16, "uint16") - TYPE_CASE(UINT32, "uint32") - TYPE_CASE(UINT64, "uint64") -#if !NPY_INT32_IS_INT - TYPE_CASE(UINT, "uintc") -#endif -#if !NPY_INT64_IS_LONG_LONG - TYPE_CASE(ULONGLONG, "ulonglong") -#endif - TYPE_CASE(FLOAT16, "float16") - TYPE_CASE(FLOAT32, "float32") - TYPE_CASE(FLOAT64, "float64") - TYPE_CASE(DATETIME, "datetime64") - TYPE_CASE(TIMEDELTA, "timedelta64") - TYPE_CASE(OBJECT, "object") - TYPE_CASE(VOID, "void") - default: - break; - } - -#undef TYPE_CASE - std::stringstream ss; - ss << "unrecognized type (" << npy_type << ") in GetNumPyTypeName"; - return ss.str(); -} - -#define TYPE_VISIT_INLINE(TYPE) \ - case NPY_##TYPE: \ - return visitor->template Visit(arr); - -template -inline Status VisitNumpyArrayInline(PyArrayObject* arr, VISITOR* visitor) { - switch (PyArray_TYPE(arr)) { - TYPE_VISIT_INLINE(BOOL); - TYPE_VISIT_INLINE(INT8); - TYPE_VISIT_INLINE(UINT8); - TYPE_VISIT_INLINE(INT16); - TYPE_VISIT_INLINE(UINT16); - 
TYPE_VISIT_INLINE(INT32); - TYPE_VISIT_INLINE(UINT32); - TYPE_VISIT_INLINE(INT64); - TYPE_VISIT_INLINE(UINT64); -#if !NPY_INT32_IS_INT - TYPE_VISIT_INLINE(INT); - TYPE_VISIT_INLINE(UINT); -#endif -#if !NPY_INT64_IS_LONG_LONG - TYPE_VISIT_INLINE(LONGLONG); - TYPE_VISIT_INLINE(ULONGLONG); -#endif - TYPE_VISIT_INLINE(FLOAT16); - TYPE_VISIT_INLINE(FLOAT32); - TYPE_VISIT_INLINE(FLOAT64); - TYPE_VISIT_INLINE(DATETIME); - TYPE_VISIT_INLINE(TIMEDELTA); - TYPE_VISIT_INLINE(OBJECT); - } - return Status::NotImplemented("NumPy type not implemented: ", - GetNumPyTypeName(PyArray_TYPE(arr))); -} - -#undef TYPE_VISIT_INLINE - -namespace internal { - -inline bool PyFloatScalar_Check(PyObject* obj) { - return PyFloat_Check(obj) || PyArray_IsScalar(obj, Floating); -} - -inline bool PyIntScalar_Check(PyObject* obj) { - return PyLong_Check(obj) || PyArray_IsScalar(obj, Integer); -} - -inline bool PyBoolScalar_Check(PyObject* obj) { - return PyBool_Check(obj) || PyArray_IsScalar(obj, Bool); -} - -static inline PyArray_Descr* GetSafeNumPyDtype(int type) { - if (type == NPY_DATETIME || type == NPY_TIMEDELTA) { - // It is not safe to mutate the result of DescrFromType for datetime and - // timedelta descriptors - return PyArray_DescrNewFromType(type); - } else { - return PyArray_DescrFromType(type); - } -} - -} // namespace internal - -} // namespace py -} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/numpy_interop.h b/src/vendored/apache-arrow-12.0.1/arrow/python/numpy_interop.h deleted file mode 100644 index ce7baed..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/numpy_interop.h +++ /dev/null @@ -1,96 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include "arrow/python/platform.h" // IWYU pragma: export - -#include // IWYU pragma: export - -// Don't use the deprecated Numpy functions -#ifdef NPY_1_7_API_VERSION -#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION -#else -#define NPY_ARRAY_NOTSWAPPED NPY_NOTSWAPPED -#define NPY_ARRAY_ALIGNED NPY_ALIGNED -#define NPY_ARRAY_WRITEABLE NPY_WRITEABLE -#define NPY_ARRAY_UPDATEIFCOPY NPY_UPDATEIFCOPY -#endif - -// This is required to be able to access the NumPy C API properly in C++ files -// other than init.cc. -#define PY_ARRAY_UNIQUE_SYMBOL arrow_ARRAY_API -#ifndef NUMPY_IMPORT_ARRAY -#define NO_IMPORT_ARRAY -#endif - -#include // IWYU pragma: export -#include // IWYU pragma: export -#include // IWYU pragma: export - -// A bit subtle. Numpy has 5 canonical integer types: -// (or, rather, type pairs: signed and unsigned) -// NPY_BYTE, NPY_SHORT, NPY_INT, NPY_LONG, NPY_LONGLONG -// It also has 4 fixed-width integer aliases. 
-// When mapping Arrow integer types to these 4 fixed-width aliases,
-// we always miss one of the canonical types (even though it may
-// have the same width as one of the aliases).
-// Which one depends on the platform...
-// On a LP64 system, NPY_INT64 maps to NPY_LONG and
-// NPY_LONGLONG needs to be handled separately.
-// On a LLP64 system, NPY_INT32 maps to NPY_LONG and
-// NPY_INT needs to be handled separately.
-
-#if NPY_BITSOF_LONG == 32 && NPY_BITSOF_LONGLONG == 64
-#define NPY_INT64_IS_LONG_LONG 1
-#else
-#define NPY_INT64_IS_LONG_LONG 0
-#endif
-
-#if NPY_BITSOF_INT == 32 && NPY_BITSOF_LONG == 64
-#define NPY_INT32_IS_INT 1
-#else
-#define NPY_INT32_IS_INT 0
-#endif
-
-namespace arrow {
-namespace py {
-
-inline int import_numpy() {
-#ifdef NUMPY_IMPORT_ARRAY
-  import_array1(-1);
-  import_umath1(-1);
-#endif
-
-  return 0;
-}
-
-// See above about the missing Numpy integer type numbers
-inline int fix_numpy_type_num(int type_num) {
-#if !NPY_INT32_IS_INT && NPY_BITSOF_INT == 32
-  if (type_num == NPY_INT) return NPY_INT32;
-  if (type_num == NPY_UINT) return NPY_UINT32;
-#endif
-#if !NPY_INT64_IS_LONG_LONG && NPY_BITSOF_LONGLONG == 64
-  if (type_num == NPY_LONGLONG) return NPY_INT64;
-  if (type_num == NPY_ULONGLONG) return NPY_UINT64;
-#endif
-  return type_num;
-}
-
-}  // namespace py
-}  // namespace arrow
diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/numpy_to_arrow.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/numpy_to_arrow.cc
deleted file mode 100644
index 2727ce3..0000000
--- a/src/vendored/apache-arrow-12.0.1/arrow/python/numpy_to_arrow.cc
+++ /dev/null
@@ -1,870 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
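[Editorial illustration, not part of the vendored sources: the numpy_interop.h helpers deleted just above decide at preprocessing time which canonical C integer type backs each fixed-width NumPy alias. A standalone sketch of the same compile-time decision, using sizeof in place of the NPY_BITSOF_* macros; the constant names are invented for this example.]

    #include <iostream>

    // LP64 (typical Linux/macOS): long is 64-bit, so int32 is plain 'int'
    // and 'long long' must be folded into int64 by hand, the way
    // fix_numpy_type_num() above folds NPY_LONGLONG into NPY_INT64.
    constexpr bool kInt32IsInt = sizeof(int) == 4 && sizeof(long) == 8;

    // LLP64 (Windows): long is 32-bit, so int64 is 'long long' and plain
    // 'int' is the alias that needs the special handling instead.
    constexpr bool kInt64IsLongLong = sizeof(long) == 4 && sizeof(long long) == 8;

    int main() {
      std::cout << "int32 is 'int': " << kInt32IsInt << '\n'
                << "int64 is 'long long': " << kInt64IsLongLong << '\n';
    }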
- -// Functions for pandas conversion via NumPy - -#include "arrow/python/numpy_to_arrow.h" -#include "arrow/python/numpy_interop.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "arrow/array.h" -#include "arrow/array/builder_binary.h" -#include "arrow/status.h" -#include "arrow/table.h" -#include "arrow/type_fwd.h" -#include "arrow/type_traits.h" -#include "arrow/util/bit_util.h" -#include "arrow/util/bitmap_generate.h" -#include "arrow/util/bitmap_ops.h" -#include "arrow/util/checked_cast.h" -#include "arrow/util/endian.h" -#include "arrow/util/logging.h" -#include "arrow/util/macros.h" -#include "arrow/util/string.h" -#include "arrow/util/utf8.h" -#include "arrow/visit_type_inline.h" - -#include "arrow/compute/api_scalar.h" - -#include "arrow/python/common.h" -#include "arrow/python/datetime.h" -#include "arrow/python/helpers.h" -#include "arrow/python/iterators.h" -#include "arrow/python/numpy_convert.h" -#include "arrow/python/numpy_internal.h" -#include "arrow/python/python_to_arrow.h" -#include "arrow/python/type_traits.h" - -namespace arrow { - -using internal::checked_cast; -using internal::CopyBitmap; -using internal::GenerateBitsUnrolled; - -namespace py { - -using internal::NumPyTypeSize; - -// ---------------------------------------------------------------------- -// Conversion utilities - -namespace { - -Status AllocateNullBitmap(MemoryPool* pool, int64_t length, - std::shared_ptr* out) { - int64_t null_bytes = bit_util::BytesForBits(length); - ARROW_ASSIGN_OR_RAISE(auto null_bitmap, AllocateResizableBuffer(null_bytes, pool)); - - // Padding zeroed by AllocateResizableBuffer - memset(null_bitmap->mutable_data(), 0, static_cast(null_bytes)); - *out = std::move(null_bitmap); - return Status::OK(); -} - -// ---------------------------------------------------------------------- -// Conversion from NumPy-in-Pandas to Arrow null bitmap - -template -inline int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) { - typedef internal::npy_traits traits; - typedef typename traits::value_type T; - - int64_t null_count = 0; - - Ndarray1DIndexer values(arr); - for (int i = 0; i < values.size(); ++i) { - if (traits::isnull(values[i])) { - ++null_count; - } else { - bit_util::SetBit(bitmap, i); - } - } - - return null_count; -} - -class NumPyNullsConverter { - public: - /// Convert the given array's null values to a null bitmap. - /// The null bitmap is only allocated if null values are ever possible. 
- static Status Convert(MemoryPool* pool, PyArrayObject* arr, bool from_pandas, - std::shared_ptr* out_null_bitmap_, - int64_t* out_null_count) { - NumPyNullsConverter converter(pool, arr, from_pandas); - RETURN_NOT_OK(VisitNumpyArrayInline(arr, &converter)); - *out_null_bitmap_ = converter.null_bitmap_; - *out_null_count = converter.null_count_; - return Status::OK(); - } - - template - Status Visit(PyArrayObject* arr) { - typedef internal::npy_traits traits; - - const bool null_sentinels_possible = - // Always treat Numpy's NaT as null - TYPE == NPY_DATETIME || TYPE == NPY_TIMEDELTA || - // Observing pandas's null sentinels - (from_pandas_ && traits::supports_nulls); - - if (null_sentinels_possible) { - RETURN_NOT_OK(AllocateNullBitmap(pool_, PyArray_SIZE(arr), &null_bitmap_)); - null_count_ = ValuesToBitmap(arr, null_bitmap_->mutable_data()); - } - return Status::OK(); - } - - protected: - NumPyNullsConverter(MemoryPool* pool, PyArrayObject* arr, bool from_pandas) - : pool_(pool), - arr_(arr), - from_pandas_(from_pandas), - null_bitmap_data_(nullptr), - null_count_(0) {} - - MemoryPool* pool_; - PyArrayObject* arr_; - bool from_pandas_; - std::shared_ptr null_bitmap_; - uint8_t* null_bitmap_data_; - int64_t null_count_; -}; - -// Returns null count -int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) { - int64_t null_count = 0; - - if (!PyArray_Check(mask)) return -1; - - Ndarray1DIndexer mask_values(mask); - for (int i = 0; i < length; ++i) { - if (mask_values[i]) { - ++null_count; - bit_util::ClearBit(bitmap, i); - } else { - bit_util::SetBit(bitmap, i); - } - } - return null_count; -} - -} // namespace - -// ---------------------------------------------------------------------- -// Conversion from NumPy arrays (possibly originating from pandas) to Arrow -// format. 
Does not handle NPY_OBJECT dtype arrays; use ConvertPySequence for -// that - -class NumPyConverter { - public: - NumPyConverter(MemoryPool* pool, PyObject* arr, PyObject* mo, - const std::shared_ptr& type, bool from_pandas, - const compute::CastOptions& cast_options = compute::CastOptions()) - : pool_(pool), - type_(type), - arr_(reinterpret_cast(arr)), - dtype_(PyArray_DESCR(arr_)), - mask_(nullptr), - from_pandas_(from_pandas), - cast_options_(cast_options), - null_bitmap_data_(nullptr), - null_count_(0) { - if (mo != nullptr && mo != Py_None) { - mask_ = reinterpret_cast(mo); - } - length_ = static_cast(PyArray_SIZE(arr_)); - itemsize_ = static_cast(PyArray_DESCR(arr_)->elsize); - stride_ = static_cast(PyArray_STRIDES(arr_)[0]); - } - - bool is_strided() const { return itemsize_ != stride_; } - - Status Convert(); - - const ArrayVector& result() const { return out_arrays_; } - - template - enable_if_primitive_ctype Visit(const T& type) { - return VisitNative(); - } - - Status Visit(const HalfFloatType& type) { return VisitNative(); } - - Status Visit(const Date32Type& type) { return VisitNative(); } - Status Visit(const Date64Type& type) { return VisitNative(); } - Status Visit(const TimestampType& type) { return VisitNative(); } - Status Visit(const Time32Type& type) { return VisitNative(); } - Status Visit(const Time64Type& type) { return VisitNative(); } - Status Visit(const DurationType& type) { return VisitNative(); } - - Status Visit(const NullType& type) { return TypeNotImplemented(type.ToString()); } - - // NumPy ascii string arrays - Status Visit(const BinaryType& type); - - // NumPy unicode arrays - Status Visit(const StringType& type); - - Status Visit(const StructType& type); - - Status Visit(const FixedSizeBinaryType& type); - - // Default case - Status Visit(const DataType& type) { return TypeNotImplemented(type.ToString()); } - - protected: - Status InitNullBitmap() { - RETURN_NOT_OK(AllocateNullBitmap(pool_, length_, &null_bitmap_)); - null_bitmap_data_ = null_bitmap_->mutable_data(); - return Status::OK(); - } - - // Called before ConvertData to ensure Numpy input buffer is in expected - // Arrow layout - template - Status PrepareInputData(std::shared_ptr* data); - - // ---------------------------------------------------------------------- - // Traditional visitor conversion for non-object arrays - - template - Status ConvertData(std::shared_ptr* data); - - template - Status PushBuilderResult(T* builder) { - std::shared_ptr out; - RETURN_NOT_OK(builder->Finish(&out)); - out_arrays_.emplace_back(out); - return Status::OK(); - } - - Status PushArray(const std::shared_ptr& data) { - out_arrays_.emplace_back(MakeArray(data)); - return Status::OK(); - } - - template - Status VisitNative() { - if (mask_ != nullptr) { - RETURN_NOT_OK(InitNullBitmap()); - null_count_ = MaskToBitmap(mask_, length_, null_bitmap_data_); - if (null_count_ == -1) return Status::Invalid("Invalid mask type"); - } else { - RETURN_NOT_OK(NumPyNullsConverter::Convert(pool_, arr_, from_pandas_, &null_bitmap_, - &null_count_)); - } - - std::shared_ptr data; - RETURN_NOT_OK(ConvertData(&data)); - - auto arr_data = ArrayData::Make(type_, length_, {null_bitmap_, data}, null_count_, 0); - return PushArray(arr_data); - } - - Status TypeNotImplemented(std::string type_name) { - return Status::NotImplemented("NumPyConverter doesn't implement <", type_name, - "> conversion. 
"); - } - - MemoryPool* pool_; - std::shared_ptr type_; - PyArrayObject* arr_; - PyArray_Descr* dtype_; - PyArrayObject* mask_; - int64_t length_; - int64_t stride_; - int itemsize_; - - bool from_pandas_; - compute::CastOptions cast_options_; - - // Used in visitor pattern - ArrayVector out_arrays_; - - std::shared_ptr null_bitmap_; - uint8_t* null_bitmap_data_; - int64_t null_count_; -}; - -Status NumPyConverter::Convert() { - if (PyArray_NDIM(arr_) != 1) { - return Status::Invalid("only handle 1-dimensional arrays"); - } - - if (dtype_->type_num == NPY_OBJECT) { - // If an object array, convert it like a normal Python sequence - PyConversionOptions py_options; - py_options.type = type_; - py_options.from_pandas = from_pandas_; - ARROW_ASSIGN_OR_RAISE( - auto chunked_array, - ConvertPySequence(reinterpret_cast(arr_), - reinterpret_cast(mask_), py_options, pool_)); - out_arrays_ = chunked_array->chunks(); - return Status::OK(); - } - - if (type_ == nullptr) { - return Status::Invalid("Must pass data type for non-object arrays"); - } - - // Visit the type to perform conversion - return VisitTypeInline(*type_, this); -} - -namespace { - -Status CastBuffer(const std::shared_ptr& in_type, - const std::shared_ptr& input, const int64_t length, - const std::shared_ptr& valid_bitmap, const int64_t null_count, - const std::shared_ptr& out_type, - const compute::CastOptions& cast_options, MemoryPool* pool, - std::shared_ptr* out) { - // Must cast - auto tmp_data = ArrayData::Make(in_type, length, {valid_bitmap, input}, null_count); - compute::ExecContext context(pool); - ARROW_ASSIGN_OR_RAISE( - std::shared_ptr casted_array, - compute::Cast(*MakeArray(tmp_data), out_type, cast_options, &context)); - *out = casted_array->data()->buffers[1]; - return Status::OK(); -} - -template -Status StaticCastBuffer(const Buffer& input, const int64_t length, MemoryPool* pool, - std::shared_ptr* out) { - ARROW_ASSIGN_OR_RAISE(auto result, AllocateBuffer(sizeof(ToType) * length, pool)); - - auto in_values = reinterpret_cast(input.data()); - auto out_values = reinterpret_cast(result->mutable_data()); - for (int64_t i = 0; i < length; ++i) { - *out_values++ = static_cast(*in_values++); - } - *out = std::move(result); - return Status::OK(); -} - -template -void CopyStridedBytewise(int8_t* input_data, int64_t length, int64_t stride, - T* output_data) { - // Passing input_data as non-const is a concession to PyObject* - for (int64_t i = 0; i < length; ++i) { - memcpy(output_data + i, input_data, sizeof(T)); - input_data += stride; - } -} - -template -void CopyStridedNatural(T* input_data, int64_t length, int64_t stride, T* output_data) { - // Passing input_data as non-const is a concession to PyObject* - int64_t j = 0; - for (int64_t i = 0; i < length; ++i) { - output_data[i] = input_data[j]; - j += stride; - } -} - -class NumPyStridedConverter { - public: - static Status Convert(PyArrayObject* arr, int64_t length, MemoryPool* pool, - std::shared_ptr* out) { - NumPyStridedConverter converter(arr, length, pool); - RETURN_NOT_OK(VisitNumpyArrayInline(arr, &converter)); - *out = converter.buffer_; - return Status::OK(); - } - template - Status Visit(PyArrayObject* arr) { - using traits = internal::npy_traits; - using T = typename traits::value_type; - - ARROW_ASSIGN_OR_RAISE(buffer_, AllocateBuffer(sizeof(T) * length_, pool_)); - - const int64_t stride = PyArray_STRIDES(arr)[0]; - // ARROW-16013: convert sizeof(T) to signed int64 first, otherwise dividing by it - // would do an unsigned division. 
This cannot be caught by tests without ubsan, since - // common signed overflow behavior and the fact that the sizeof(T) is currently always - // a power of two here cause CopyStridedNatural to still produce correct results - const int64_t element_size = sizeof(T); - if (stride % element_size == 0) { - const int64_t stride_elements = stride / element_size; - CopyStridedNatural(reinterpret_cast(PyArray_DATA(arr)), length_, - stride_elements, reinterpret_cast(buffer_->mutable_data())); - } else { - CopyStridedBytewise(reinterpret_cast(PyArray_DATA(arr)), length_, stride, - reinterpret_cast(buffer_->mutable_data())); - } - return Status::OK(); - } - - protected: - NumPyStridedConverter(PyArrayObject* arr, int64_t length, MemoryPool* pool) - : arr_(arr), length_(length), pool_(pool), buffer_(nullptr) {} - PyArrayObject* arr_; - int64_t length_; - MemoryPool* pool_; - std::shared_ptr buffer_; -}; - -} // namespace - -template -inline Status NumPyConverter::PrepareInputData(std::shared_ptr* data) { - if (PyArray_ISBYTESWAPPED(arr_)) { - // TODO - return Status::NotImplemented("Byte-swapped arrays not supported"); - } - - if (dtype_->type_num == NPY_BOOL) { - int64_t nbytes = bit_util::BytesForBits(length_); - ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateBuffer(nbytes, pool_)); - - Ndarray1DIndexer values(arr_); - int64_t i = 0; - const auto generate = [&values, &i]() -> bool { return values[i++] > 0; }; - GenerateBitsUnrolled(buffer->mutable_data(), 0, length_, generate); - - *data = std::move(buffer); - } else if (is_strided()) { - RETURN_NOT_OK(NumPyStridedConverter::Convert(arr_, length_, pool_, data)); - } else { - // Can zero-copy - *data = std::make_shared(reinterpret_cast(arr_)); - } - - return Status::OK(); -} - -template -inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { - RETURN_NOT_OK(PrepareInputData(data)); - - std::shared_ptr input_type; - RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast(dtype_), &input_type)); - - if (!input_type->Equals(*type_)) { - RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_, type_, - cast_options_, pool_, data)); - } - - return Status::OK(); -} - -template <> -inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { - std::shared_ptr input_type; - - RETURN_NOT_OK(PrepareInputData(data)); - - auto date_dtype = reinterpret_cast(dtype_->c_metadata); - if (dtype_->type_num == NPY_DATETIME) { - // If we have inbound datetime64[D] data, this needs to be downcasted - // separately here from int64_t to int32_t, because this data is not - // supported in compute::Cast - if (date_dtype->meta.base == NPY_FR_D) { - // TODO(wesm): How pedantic do we really want to be about checking for int32 - // overflow here? 
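// --- Editorial aside, not part of the vendored file ---
// One pedantic answer to the int32-overflow question in the comment above:
// a checked narrowing helper that reports out-of-range day values instead
// of silently truncating them. Hypothetical name, standard C++17 only.
//
//     #include <cstdint>
//     #include <limits>
//     #include <optional>
//
//     std::optional<int32_t> CheckedNarrowToInt32(int64_t v) {
//       if (v < std::numeric_limits<int32_t>::min() ||
//           v > std::numeric_limits<int32_t>::max()) {
//         return std::nullopt;  // would overflow int32: surface an error
//       }
//       return static_cast<int32_t>(v);  // safe narrowing
//     }
// --- end aside ---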
- Status s = StaticCastBuffer(**data, length_, pool_, data); - RETURN_NOT_OK(s); - } else { - RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast(dtype_), &input_type)); - if (!input_type->Equals(*type_)) { - // The null bitmap was already computed in VisitNative() - RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_, - type_, cast_options_, pool_, data)); - } - } - } else { - RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast(dtype_), &input_type)); - if (!input_type->Equals(*type_)) { - RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_, - type_, cast_options_, pool_, data)); - } - } - - return Status::OK(); -} - -template <> -inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { - constexpr int64_t kMillisecondsInDay = 86400000; - std::shared_ptr input_type; - - RETURN_NOT_OK(PrepareInputData(data)); - - auto date_dtype = reinterpret_cast(dtype_->c_metadata); - if (dtype_->type_num == NPY_DATETIME) { - // If we have inbound datetime64[D] data, this needs to be downcasted - // separately here from int64_t to int32_t, because this data is not - // supported in compute::Cast - if (date_dtype->meta.base == NPY_FR_D) { - ARROW_ASSIGN_OR_RAISE(auto result, - AllocateBuffer(sizeof(int64_t) * length_, pool_)); - - auto in_values = reinterpret_cast((*data)->data()); - auto out_values = reinterpret_cast(result->mutable_data()); - for (int64_t i = 0; i < length_; ++i) { - *out_values++ = kMillisecondsInDay * (*in_values++); - } - *data = std::move(result); - } else { - RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast(dtype_), &input_type)); - if (!input_type->Equals(*type_)) { - // The null bitmap was already computed in VisitNative() - RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_, - type_, cast_options_, pool_, data)); - } - } - } else { - RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast(dtype_), &input_type)); - if (!input_type->Equals(*type_)) { - RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_, - type_, cast_options_, pool_, data)); - } - } - - return Status::OK(); -} - -// Create 16MB chunks for binary data -constexpr int32_t kBinaryChunksize = 1 << 24; - -Status NumPyConverter::Visit(const BinaryType& type) { - ::arrow::internal::ChunkedBinaryBuilder builder(kBinaryChunksize, pool_); - - auto data = reinterpret_cast(PyArray_DATA(arr_)); - - auto AppendNotNull = [&builder, this](const uint8_t* data) { - // This is annoying. 
NumPy allows strings to have nul-terminators, so - // we must check for them here - const size_t item_size = - strnlen(reinterpret_cast(data), static_cast(itemsize_)); - return builder.Append(data, static_cast(item_size)); - }; - - if (mask_ != nullptr) { - Ndarray1DIndexer mask_values(mask_); - for (int64_t i = 0; i < length_; ++i) { - if (mask_values[i]) { - RETURN_NOT_OK(builder.AppendNull()); - } else { - RETURN_NOT_OK(AppendNotNull(data)); - } - data += stride_; - } - } else { - for (int64_t i = 0; i < length_; ++i) { - RETURN_NOT_OK(AppendNotNull(data)); - data += stride_; - } - } - - ArrayVector result; - RETURN_NOT_OK(builder.Finish(&result)); - for (auto arr : result) { - RETURN_NOT_OK(PushArray(arr->data())); - } - return Status::OK(); -} - -Status NumPyConverter::Visit(const FixedSizeBinaryType& type) { - auto byte_width = type.byte_width(); - - if (itemsize_ != byte_width) { - return Status::Invalid("Got bytestring of length ", itemsize_, " (expected ", - byte_width, ")"); - } - - FixedSizeBinaryBuilder builder(::arrow::fixed_size_binary(byte_width), pool_); - auto data = reinterpret_cast(PyArray_DATA(arr_)); - - if (mask_ != nullptr) { - Ndarray1DIndexer mask_values(mask_); - RETURN_NOT_OK(builder.Reserve(length_)); - for (int64_t i = 0; i < length_; ++i) { - if (mask_values[i]) { - RETURN_NOT_OK(builder.AppendNull()); - } else { - RETURN_NOT_OK(builder.Append(data)); - } - data += stride_; - } - } else { - for (int64_t i = 0; i < length_; ++i) { - RETURN_NOT_OK(builder.Append(data)); - data += stride_; - } - } - - std::shared_ptr result; - RETURN_NOT_OK(builder.Finish(&result)); - return PushArray(result->data()); -} - -namespace { - -// NumPy unicode is UCS4/UTF32 always -constexpr int kNumPyUnicodeSize = 4; - -Status AppendUTF32(const char* data, int itemsize, int byteorder, - ::arrow::internal::ChunkedStringBuilder* builder) { - // The binary \x00\x00\x00\x00 indicates a nul terminator in NumPy unicode, - // so we need to detect that here to truncate if necessary. Yep. - int actual_length = 0; - for (; actual_length < itemsize / kNumPyUnicodeSize; ++actual_length) { - const char* code_point = data + actual_length * kNumPyUnicodeSize; - if ((*code_point == '\0') && (*(code_point + 1) == '\0') && - (*(code_point + 2) == '\0') && (*(code_point + 3) == '\0')) { - break; - } - } - - OwnedRef unicode_obj(PyUnicode_DecodeUTF32(data, actual_length * kNumPyUnicodeSize, - nullptr, &byteorder)); - RETURN_IF_PYERROR(); - OwnedRef utf8_obj(PyUnicode_AsUTF8String(unicode_obj.obj())); - if (utf8_obj.obj() == NULL) { - PyErr_Clear(); - return Status::Invalid("failed converting UTF32 to UTF8"); - } - - const int32_t length = static_cast(PyBytes_GET_SIZE(utf8_obj.obj())); - return builder->Append( - reinterpret_cast(PyBytes_AS_STRING(utf8_obj.obj())), length); -} - -} // namespace - -Status NumPyConverter::Visit(const StringType& type) { - util::InitializeUTF8(); - - ::arrow::internal::ChunkedStringBuilder builder(kBinaryChunksize, pool_); - - auto data = reinterpret_cast(PyArray_DATA(arr_)); - - char numpy_byteorder = dtype_->byteorder; - - // For Python C API, -1 is little-endian, 1 is big-endian -#if ARROW_LITTLE_ENDIAN - // Yield little-endian from both '|' (native) and '<' - int byteorder = numpy_byteorder == '>' ? 1 : -1; -#else - // Yield big-endian from both '|' (native) and '>' - int byteorder = numpy_byteorder == '<' ? 
-1 : 1; -#endif - - PyAcquireGIL gil_lock; - - const bool is_binary_type = dtype_->type_num == NPY_STRING; - const bool is_unicode_type = dtype_->type_num == NPY_UNICODE; - - if (!is_binary_type && !is_unicode_type) { - const bool is_float_type = dtype_->kind == 'f'; - if (from_pandas_ && is_float_type) { - // in case of from_pandas=True, accept an all-NaN float array as input - RETURN_NOT_OK(NumPyNullsConverter::Convert(pool_, arr_, from_pandas_, &null_bitmap_, - &null_count_)); - if (null_count_ == length_) { - auto arr = std::make_shared(length_); - compute::ExecContext context(pool_); - ARROW_ASSIGN_OR_RAISE( - std::shared_ptr out, - compute::Cast(*arr, arrow::utf8(), cast_options_, &context)); - out_arrays_.emplace_back(out); - return Status::OK(); - } - } - std::string dtype_string; - RETURN_NOT_OK(internal::PyObject_StdStringStr(reinterpret_cast(dtype_), - &dtype_string)); - return Status::TypeError("Expected a string or bytes dtype, got ", dtype_string); - } - - auto AppendNonNullValue = [&](const uint8_t* data) { - if (is_binary_type) { - if (ARROW_PREDICT_TRUE(util::ValidateUTF8(data, itemsize_))) { - return builder.Append(data, itemsize_); - } else { - return Status::Invalid("Encountered non-UTF8 binary value: ", - HexEncode(data, itemsize_)); - } - } else { - // is_unicode_type case - return AppendUTF32(reinterpret_cast(data), itemsize_, byteorder, - &builder); - } - }; - - if (mask_ != nullptr) { - Ndarray1DIndexer mask_values(mask_); - for (int64_t i = 0; i < length_; ++i) { - if (mask_values[i]) { - RETURN_NOT_OK(builder.AppendNull()); - } else { - RETURN_NOT_OK(AppendNonNullValue(data)); - } - data += stride_; - } - } else { - for (int64_t i = 0; i < length_; ++i) { - RETURN_NOT_OK(AppendNonNullValue(data)); - data += stride_; - } - } - - ArrayVector result; - RETURN_NOT_OK(builder.Finish(&result)); - for (auto arr : result) { - RETURN_NOT_OK(PushArray(arr->data())); - } - return Status::OK(); -} - -Status NumPyConverter::Visit(const StructType& type) { - std::vector sub_converters; - std::vector sub_arrays; - - { - PyAcquireGIL gil_lock; - - // Create converters for each struct type field - if (dtype_->fields == NULL || !PyDict_Check(dtype_->fields)) { - return Status::TypeError("Expected struct array"); - } - - for (auto field : type.fields()) { - PyObject* tup = PyDict_GetItemString(dtype_->fields, field->name().c_str()); - if (tup == NULL) { - return Status::Invalid("Missing field '", field->name(), "' in struct array"); - } - PyArray_Descr* sub_dtype = - reinterpret_cast(PyTuple_GET_ITEM(tup, 0)); - DCHECK(PyObject_TypeCheck(sub_dtype, &PyArrayDescr_Type)); - int offset = static_cast(PyLong_AsLong(PyTuple_GET_ITEM(tup, 1))); - RETURN_IF_PYERROR(); - Py_INCREF(sub_dtype); /* PyArray_GetField() steals ref */ - PyObject* sub_array = PyArray_GetField(arr_, sub_dtype, offset); - RETURN_IF_PYERROR(); - sub_arrays.emplace_back(sub_array); - sub_converters.emplace_back(pool_, sub_array, nullptr /* mask */, field->type(), - from_pandas_); - } - } - - std::vector groups; - int64_t null_count = 0; - - // Compute null bitmap and store it as a Boolean Array to include it - // in the rechunking below - { - if (mask_ != nullptr) { - RETURN_NOT_OK(InitNullBitmap()); - null_count = MaskToBitmap(mask_, length_, null_bitmap_data_); - if (null_count_ == -1) return Status::Invalid("Invalid mask type"); - } - groups.push_back({std::make_shared(length_, null_bitmap_)}); - } - - // Convert child data - for (auto& converter : sub_converters) { - RETURN_NOT_OK(converter.Convert()); - 
groups.push_back(converter.result()); - } - // Ensure the different array groups are chunked consistently - groups = ::arrow::internal::RechunkArraysConsistently(groups); - - // Make struct array chunks by combining groups - size_t ngroups = groups.size(); - size_t nchunks = groups[0].size(); - for (size_t chunk = 0; chunk < nchunks; chunk++) { - // First group has the null bitmaps as Boolean Arrays - const auto& null_data = groups[0][chunk]->data(); - DCHECK_EQ(null_data->type->id(), Type::BOOL); - DCHECK_EQ(null_data->buffers.size(), 2); - const auto& null_buffer = null_data->buffers[1]; - // Careful: the rechunked null bitmap may have a non-zero offset - // to its buffer, and it may not even start on a byte boundary - int64_t null_offset = null_data->offset; - std::shared_ptr fixed_null_buffer; - - if (!null_buffer) { - fixed_null_buffer = null_buffer; - } else if (null_offset % 8 == 0) { - fixed_null_buffer = - std::make_shared(null_buffer, - // byte offset - null_offset / 8, - // byte size - bit_util::BytesForBits(null_data->length)); - } else { - ARROW_ASSIGN_OR_RAISE( - fixed_null_buffer, - CopyBitmap(pool_, null_buffer->data(), null_offset, null_data->length)); - } - - // Create struct array chunk and populate it - auto arr_data = - ArrayData::Make(type_, null_data->length, null_count ? kUnknownNullCount : 0, 0); - arr_data->buffers.push_back(fixed_null_buffer); - // Append child chunks - for (size_t i = 1; i < ngroups; i++) { - arr_data->child_data.push_back(groups[i][chunk]->data()); - } - RETURN_NOT_OK(PushArray(arr_data)); - } - - return Status::OK(); -} - -Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas, - const std::shared_ptr& type, - const compute::CastOptions& cast_options, - std::shared_ptr* out) { - if (!PyArray_Check(ao)) { - // This code path cannot be reached by Python unit tests currently so this - // is only a sanity check. - return Status::TypeError("Input object was not a NumPy array"); - } - if (PyArray_NDIM(reinterpret_cast(ao)) != 1) { - return Status::Invalid("only handle 1-dimensional arrays"); - } - - NumPyConverter converter(pool, ao, mo, type, from_pandas, cast_options); - RETURN_NOT_OK(converter.Convert()); - const auto& output_arrays = converter.result(); - DCHECK_GT(output_arrays.size(), 0); - *out = std::make_shared(output_arrays); - return Status::OK(); -} - -Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas, - const std::shared_ptr& type, - std::shared_ptr* out) { - return NdarrayToArrow(pool, ao, mo, from_pandas, type, compute::CastOptions(), out); -} - -} // namespace py -} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/numpy_to_arrow.h b/src/vendored/apache-arrow-12.0.1/arrow/python/numpy_to_arrow.h deleted file mode 100644 index b6cd093..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/numpy_to_arrow.h +++ /dev/null @@ -1,72 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Converting from pandas memory representation to Arrow data structures - -#pragma once - -#include "arrow/python/platform.h" - -#include - -#include "arrow/compute/api.h" -#include "arrow/python/visibility.h" - -namespace arrow { - -class Array; -class ChunkedArray; -class DataType; -class MemoryPool; -class Status; - -namespace py { - -/// Convert NumPy arrays to Arrow. If target data type is not known, pass a -/// type with null -/// -/// \param[in] pool Memory pool for any memory allocations -/// \param[in] ao an ndarray with the array data -/// \param[in] mo an ndarray with a null mask (True is null), optional -/// \param[in] from_pandas If true, use pandas's null sentinels to determine -/// whether values are null -/// \param[in] type a specific type to cast to, may be null -/// \param[in] cast_options casting options -/// \param[out] out a ChunkedArray, to accommodate chunked output -ARROW_PYTHON_EXPORT -Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas, - const std::shared_ptr& type, - const compute::CastOptions& cast_options, - std::shared_ptr* out); - -/// Safely convert NumPy arrays to Arrow. If target data type is not known, -/// pass a type with null. -/// -/// \param[in] pool Memory pool for any memory allocations -/// \param[in] ao an ndarray with the array data -/// \param[in] mo an ndarray with a null mask (True is null), optional -/// \param[in] from_pandas If true, use pandas's null sentinels to determine -/// whether values are null -/// \param[in] type a specific type to cast to, may be null -/// \param[out] out a ChunkedArray, to accommodate chunked output -ARROW_PYTHON_EXPORT -Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas, - const std::shared_ptr& type, - std::shared_ptr* out); - -} // namespace py -} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/parquet_encryption.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/parquet_encryption.cc deleted file mode 100644 index a5f924b..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/parquet_encryption.cc +++ /dev/null @@ -1,98 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
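[Editorial illustration: to make the NdarrayToArrow API deleted just above concrete, here is a hedged usage sketch. It assumes an initialized CPython/NumPy runtime and the removed vendored headers on the include path; `ao` is a 1-D float64 ndarray and the mask argument is omitted.]

    #include "arrow/api.h"
    #include "arrow/python/numpy_to_arrow.h"

    // Sketch only: convert a 1-D float64 ndarray into an Arrow ChunkedArray,
    // treating pandas null sentinels (NaN) as nulls via from_pandas=true.
    arrow::Status ConvertExample(PyObject* ao) {
      std::shared_ptr<arrow::ChunkedArray> out;
      ARROW_RETURN_NOT_OK(arrow::py::NdarrayToArrow(
          arrow::default_memory_pool(), ao, /*mo=*/nullptr,
          /*from_pandas=*/true, arrow::float64(), &out));
      return arrow::Status::OK();
    }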
- -#include "arrow/python/parquet_encryption.h" -#include "parquet/exception.h" - -namespace arrow { -namespace py { -namespace parquet { -namespace encryption { - -PyKmsClient::PyKmsClient(PyObject* handler, PyKmsClientVtable vtable) - : handler_(handler), vtable_(std::move(vtable)) { - Py_INCREF(handler); -} - -PyKmsClient::~PyKmsClient() {} - -std::string PyKmsClient::WrapKey(const std::string& key_bytes, - const std::string& master_key_identifier) { - std::string wrapped; - auto st = SafeCallIntoPython([&]() -> Status { - vtable_.wrap_key(handler_.obj(), key_bytes, master_key_identifier, &wrapped); - return CheckPyError(); - }); - if (!st.ok()) { - throw ::parquet::ParquetStatusException(st); - } - return wrapped; -} - -std::string PyKmsClient::UnwrapKey(const std::string& wrapped_key, - const std::string& master_key_identifier) { - std::string unwrapped; - auto st = SafeCallIntoPython([&]() -> Status { - vtable_.unwrap_key(handler_.obj(), wrapped_key, master_key_identifier, &unwrapped); - return CheckPyError(); - }); - if (!st.ok()) { - throw ::parquet::ParquetStatusException(st); - } - return unwrapped; -} - -PyKmsClientFactory::PyKmsClientFactory(PyObject* handler, PyKmsClientFactoryVtable vtable) - : handler_(handler), vtable_(std::move(vtable)) { - Py_INCREF(handler); -} - -PyKmsClientFactory::~PyKmsClientFactory() {} - -std::shared_ptr<::parquet::encryption::KmsClient> PyKmsClientFactory::CreateKmsClient( - const ::parquet::encryption::KmsConnectionConfig& kms_connection_config) { - std::shared_ptr<::parquet::encryption::KmsClient> kms_client; - auto st = SafeCallIntoPython([&]() -> Status { - vtable_.create_kms_client(handler_.obj(), kms_connection_config, &kms_client); - return CheckPyError(); - }); - if (!st.ok()) { - throw ::parquet::ParquetStatusException(st); - } - return kms_client; -} - -arrow::Result> -PyCryptoFactory::SafeGetFileEncryptionProperties( - const ::parquet::encryption::KmsConnectionConfig& kms_connection_config, - const ::parquet::encryption::EncryptionConfiguration& encryption_config) { - PARQUET_CATCH_AND_RETURN( - this->GetFileEncryptionProperties(kms_connection_config, encryption_config)); -} - -arrow::Result> -PyCryptoFactory::SafeGetFileDecryptionProperties( - const ::parquet::encryption::KmsConnectionConfig& kms_connection_config, - const ::parquet::encryption::DecryptionConfiguration& decryption_config) { - PARQUET_CATCH_AND_RETURN( - this->GetFileDecryptionProperties(kms_connection_config, decryption_config)); -} - -} // namespace encryption -} // namespace parquet -} // namespace py -} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/parquet_encryption.h b/src/vendored/apache-arrow-12.0.1/arrow/python/parquet_encryption.h deleted file mode 100644 index 23ee478..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/parquet_encryption.h +++ /dev/null @@ -1,109 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include - -#include "arrow/python/common.h" -#include "arrow/python/visibility.h" -#include "arrow/util/macros.h" -#include "parquet/encryption/crypto_factory.h" -#include "parquet/encryption/kms_client.h" -#include "parquet/encryption/kms_client_factory.h" - -namespace arrow { -namespace py { -namespace parquet { -namespace encryption { - -/// \brief A table of function pointers for calling from C++ into -/// Python. -class ARROW_PYTHON_EXPORT PyKmsClientVtable { - public: - std::function - wrap_key; - std::function - unwrap_key; -}; - -/// \brief A helper for KmsClient implementation in Python. -class ARROW_PYTHON_EXPORT PyKmsClient : public ::parquet::encryption::KmsClient { - public: - PyKmsClient(PyObject* handler, PyKmsClientVtable vtable); - ~PyKmsClient() override; - - std::string WrapKey(const std::string& key_bytes, - const std::string& master_key_identifier) override; - - std::string UnwrapKey(const std::string& wrapped_key, - const std::string& master_key_identifier) override; - - private: - OwnedRefNoGIL handler_; - PyKmsClientVtable vtable_; -}; - -/// \brief A table of function pointers for calling from C++ into -/// Python. -class ARROW_PYTHON_EXPORT PyKmsClientFactoryVtable { - public: - std::function* out)> - create_kms_client; -}; - -/// \brief A helper for KmsClientFactory implementation in Python. -class ARROW_PYTHON_EXPORT PyKmsClientFactory - : public ::parquet::encryption::KmsClientFactory { - public: - PyKmsClientFactory(PyObject* handler, PyKmsClientFactoryVtable vtable); - ~PyKmsClientFactory() override; - - std::shared_ptr<::parquet::encryption::KmsClient> CreateKmsClient( - const ::parquet::encryption::KmsConnectionConfig& kms_connection_config) override; - - private: - OwnedRefNoGIL handler_; - PyKmsClientFactoryVtable vtable_; -}; - -/// \brief A CryptoFactory that returns Results instead of throwing exceptions. -class ARROW_PYTHON_EXPORT PyCryptoFactory : public ::parquet::encryption::CryptoFactory { - public: - arrow::Result> - SafeGetFileEncryptionProperties( - const ::parquet::encryption::KmsConnectionConfig& kms_connection_config, - const ::parquet::encryption::EncryptionConfiguration& encryption_config); - - /// The returned FileDecryptionProperties object will use the cache inside this - /// CryptoFactory object, so please keep this - /// CryptoFactory object alive along with the returned - /// FileDecryptionProperties object. - arrow::Result> - SafeGetFileDecryptionProperties( - const ::parquet::encryption::KmsConnectionConfig& kms_connection_config, - const ::parquet::encryption::DecryptionConfiguration& decryption_config); -}; - -} // namespace encryption -} // namespace parquet -} // namespace py -} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/pch.h b/src/vendored/apache-arrow-12.0.1/arrow/python/pch.h deleted file mode 100644 index d1d688b..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/pch.h +++ /dev/null @@ -1,24 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Often-used headers, for precompiling. -// If updating this header, please make sure you check compilation speed -// before checking in. Adding headers which are not used extremely often -// may incur a slowdown, since it makes the precompiled header heavier to load. - -#include "arrow/pch.h" -#include "arrow/python/platform.h" diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/platform.h b/src/vendored/apache-arrow-12.0.1/arrow/python/platform.h deleted file mode 100644 index e71c7ac..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/platform.h +++ /dev/null @@ -1,41 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Functions for converting between pandas's NumPy-based data representation -// and Arrow data structures - -#pragma once - -// If PY_SSIZE_T_CLEAN is defined, argument parsing functions treat #-specifier -// to mean Py_ssize_t (defining this to suppress deprecation warning) -#define PY_SSIZE_T_CLEAN - -#include // IWYU pragma: export -#include - -// Work around C2528 error -#ifdef _MSC_VER -#if _MSC_VER >= 1900 -#undef timezone -#endif - -// https://bugs.python.org/issue36020 -// TODO(wjones127): Can remove once we drop support for CPython 3.9 -#ifdef snprintf -#undef snprintf -#endif -#endif diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/pyarrow.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/pyarrow.cc deleted file mode 100644 index 30d1f04..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/pyarrow.cc +++ /dev/null @@ -1,94 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "arrow/python/pyarrow.h"
-
-#include <memory>
-#include <utility>
-
-#include "arrow/array.h"
-#include "arrow/table.h"
-#include "arrow/tensor.h"
-#include "arrow/type.h"
-
-#include "arrow/python/common.h"
-#include "arrow/python/datetime.h"
-namespace {
-#include "arrow/python/pyarrow_api.h"
-}
-
-namespace arrow {
-namespace py {
-
-static Status UnwrapError(PyObject* obj, const char* expected_type) {
-  return Status::TypeError("Could not unwrap ", expected_type,
-                           " from Python object of type '", Py_TYPE(obj)->tp_name, "'");
-}
-
-int import_pyarrow() {
-#ifdef PYPY_VERSION
-  PyDateTime_IMPORT;
-#else
-  internal::InitDatetime();
-#endif
-  return ::import_pyarrow__lib();
-}
-
-#define DEFINE_WRAP_FUNCTIONS(FUNC_SUFFIX, TYPE_NAME)                                   \
-  bool is_##FUNC_SUFFIX(PyObject* obj) { return ::pyarrow_is_##FUNC_SUFFIX(obj) != 0; } \
-                                                                                        \
-  PyObject* wrap_##FUNC_SUFFIX(const std::shared_ptr<TYPE_NAME>& src) {                 \
-    return ::pyarrow_wrap_##FUNC_SUFFIX(src);                                           \
-  }                                                                                     \
-  Result<std::shared_ptr<TYPE_NAME>> unwrap_##FUNC_SUFFIX(PyObject* obj) {              \
-    auto out = ::pyarrow_unwrap_##FUNC_SUFFIX(obj);                                     \
-    if (out) {                                                                          \
-      return std::move(out);                                                            \
-    } else {                                                                            \
-      return UnwrapError(obj, #TYPE_NAME);                                              \
-    }                                                                                   \
-  }
-
-DEFINE_WRAP_FUNCTIONS(buffer, Buffer)
-
-DEFINE_WRAP_FUNCTIONS(data_type, DataType)
-DEFINE_WRAP_FUNCTIONS(field, Field)
-DEFINE_WRAP_FUNCTIONS(schema, Schema)
-
-DEFINE_WRAP_FUNCTIONS(scalar, Scalar)
-
-DEFINE_WRAP_FUNCTIONS(array, Array)
-DEFINE_WRAP_FUNCTIONS(chunked_array, ChunkedArray)
-
-DEFINE_WRAP_FUNCTIONS(sparse_coo_tensor, SparseCOOTensor)
-DEFINE_WRAP_FUNCTIONS(sparse_csc_matrix, SparseCSCMatrix)
-DEFINE_WRAP_FUNCTIONS(sparse_csf_tensor, SparseCSFTensor)
-DEFINE_WRAP_FUNCTIONS(sparse_csr_matrix, SparseCSRMatrix)
-DEFINE_WRAP_FUNCTIONS(tensor, Tensor)
-
-DEFINE_WRAP_FUNCTIONS(batch, RecordBatch)
-DEFINE_WRAP_FUNCTIONS(table, Table)
-
-#undef DEFINE_WRAP_FUNCTIONS
-
-namespace internal {
-
-int check_status(const Status& status) { return ::pyarrow_internal_check_status(status); }
-
-}  // namespace internal
-}  // namespace py
-}  // namespace arrow
diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/pyarrow.h b/src/vendored/apache-arrow-12.0.1/arrow/python/pyarrow.h
deleted file mode 100644
index 4c36508..0000000
--- a/src/vendored/apache-arrow-12.0.1/arrow/python/pyarrow.h
+++ /dev/null
@@ -1,84 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
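[Editorial illustration: a hedged sketch of how downstream C++ consumed the wrap/unwrap helpers deleted above. It assumes pyarrow is importable at runtime and these vendored headers are available; error handling is abbreviated.]

    #include <Python.h>
    #include "arrow/python/pyarrow.h"

    arrow::Status UseWrappers(PyObject* obj) {
      // Load the pyarrow.lib C API once; returns 0 on success, -1 on error.
      if (arrow::py::import_pyarrow() != 0) {
        return arrow::Status::IOError("pyarrow import failed");
      }
      // unwrap_array yields a TypeError status if obj is not a pyarrow Array.
      ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> arr,
                            arrow::py::unwrap_array(obj));
      PyObject* back = arrow::py::wrap_array(arr);  // new Python reference
      Py_XDECREF(back);
      return arrow::Status::OK();
    }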
- -#pragma once - -#include "arrow/python/platform.h" - -#include - -#include "arrow/python/visibility.h" - -#include "arrow/sparse_tensor.h" - -// Work around ARROW-2317 (C linkage warning from Cython) -extern "C++" { - -namespace arrow { - -class Array; -class Buffer; -class DataType; -class Field; -class RecordBatch; -class Schema; -class Status; -class Table; -class Tensor; - -namespace py { - -// Returns 0 on success, -1 on error. -ARROW_PYTHON_EXPORT int import_pyarrow(); - -#define DECLARE_WRAP_FUNCTIONS(FUNC_SUFFIX, TYPE_NAME) \ - ARROW_PYTHON_EXPORT bool is_##FUNC_SUFFIX(PyObject*); \ - ARROW_PYTHON_EXPORT Result> unwrap_##FUNC_SUFFIX( \ - PyObject*); \ - ARROW_PYTHON_EXPORT PyObject* wrap_##FUNC_SUFFIX(const std::shared_ptr&); - -DECLARE_WRAP_FUNCTIONS(buffer, Buffer) - -DECLARE_WRAP_FUNCTIONS(data_type, DataType) -DECLARE_WRAP_FUNCTIONS(field, Field) -DECLARE_WRAP_FUNCTIONS(schema, Schema) - -DECLARE_WRAP_FUNCTIONS(scalar, Scalar) - -DECLARE_WRAP_FUNCTIONS(array, Array) -DECLARE_WRAP_FUNCTIONS(chunked_array, ChunkedArray) - -DECLARE_WRAP_FUNCTIONS(sparse_coo_tensor, SparseCOOTensor) -DECLARE_WRAP_FUNCTIONS(sparse_csc_matrix, SparseCSCMatrix) -DECLARE_WRAP_FUNCTIONS(sparse_csf_tensor, SparseCSFTensor) -DECLARE_WRAP_FUNCTIONS(sparse_csr_matrix, SparseCSRMatrix) -DECLARE_WRAP_FUNCTIONS(tensor, Tensor) - -DECLARE_WRAP_FUNCTIONS(batch, RecordBatch) -DECLARE_WRAP_FUNCTIONS(table, Table) - -#undef DECLARE_WRAP_FUNCTIONS - -namespace internal { - -ARROW_PYTHON_EXPORT int check_status(const Status& status); - -} // namespace internal -} // namespace py -} // namespace arrow - -} // extern "C++" diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/pyarrow_api.h b/src/vendored/apache-arrow-12.0.1/arrow/python/pyarrow_api.h deleted file mode 100644 index a476e55..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/pyarrow_api.h +++ /dev/null @@ -1,19 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// For backward compatibility. -#include "arrow/python/lib_api.h" diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/pyarrow_lib.h b/src/vendored/apache-arrow-12.0.1/arrow/python/pyarrow_lib.h deleted file mode 100644 index e509593..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/pyarrow_lib.h +++ /dev/null @@ -1,19 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// For backward compatibility. -#include "arrow/python/lib.h" diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/python_test.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/python_test.cc deleted file mode 100644 index 01ab8a3..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/python_test.cc +++ /dev/null @@ -1,888 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include -#include -#include -#include - -#include "platform.h" - -#include "arrow/array.h" -#include "arrow/array/builder_binary.h" -#include "arrow/table.h" -#include "arrow/util/decimal.h" -#include "arrow/util/logging.h" - -#include "arrow/python/arrow_to_pandas.h" -#include "arrow/python/decimal.h" -#include "arrow/python/helpers.h" -#include "arrow/python/numpy_convert.h" -#include "arrow/python/numpy_interop.h" -#include "arrow/python/python_test.h" -#include "arrow/python/python_to_arrow.h" - -#define ASSERT_EQ(x, y) \ - { \ - auto&& _left = (x); \ - auto&& _right = (y); \ - if (_left != _right) { \ - return Status::Invalid("Expected equality between `", #x, "` and `", #y, \ - "`, but ", arrow::py::testing::ToString(_left), \ - " != ", arrow::py::testing::ToString(_right)); \ - } \ - } - -#define ASSERT_NE(x, y) \ - { \ - auto&& _left = (x); \ - auto&& _right = (y); \ - if (_left == _right) { \ - return Status::Invalid("Expected inequality between `", #x, "` and `", #y, \ - "`, but ", arrow::py::testing::ToString(_left), \ - " == ", arrow::py::testing::ToString(_right)); \ - } \ - } - -#define ASSERT_FALSE(v) \ - { \ - auto&& _v = (v); \ - if (!!_v) { \ - return Status::Invalid("Expected `", #v, "` to evaluate to false, but got ", \ - arrow::py::testing::ToString(_v)); \ - } \ - } - -#define ASSERT_TRUE(v) \ - { \ - auto&& _v = (v); \ - if (!_v) { \ - return Status::Invalid("Expected `", #v, "` to evaluate to true, but got ", \ - arrow::py::testing::ToString(_v)); \ - } \ - } - -#define ASSERT_FALSE_MSG(v, msg) \ - { \ - auto&& _v = (v); \ - if (!!_v) { \ - return Status::Invalid("Expected `", #v, "` to evaluate to false, but got ", \ - arrow::py::testing::ToString(_v), ": ", msg); \ - } \ - } - -#define ASSERT_TRUE_MSG(v, msg) \ - { \ - auto&& _v = (v); \ - if (!_v) { \ - return Status::Invalid("Expected `", #v, "` to evaluate to true, but got ", \ - arrow::py::testing::ToString(_v), ": ", msg); \ - } \ - } - -#define 
ASSERT_OK(expr) \ - { \ - for (::arrow::Status _st = ::arrow::internal::GenericToStatus((expr)); !_st.ok();) \ - return Status::Invalid("`", #expr, "` failed with ", _st.ToString()); \ - } - -#define ASSERT_RAISES(code, expr) \ - { \ - for (::arrow::Status _st_expr = ::arrow::internal::GenericToStatus((expr)); \ - !_st_expr.Is##code();) \ - return Status::Invalid("Expected `", #expr, "` to fail with ", #code, \ - ", but got ", _st_expr.ToString()); \ - } - -namespace arrow { - -using internal::checked_cast; - -namespace py { -namespace testing { - -// ARROW-17938: Some standard libraries have ambiguous operator<<(nullptr_t), -// work around it using a custom printer function. - -template -std::string ToString(const T& t) { - std::stringstream ss; - ss << t; - return ss.str(); -} - -template <> -std::string ToString(const std::nullptr_t&) { - return "nullptr"; -} - -namespace { - -Status TestOwnedRefMoves() { - std::vector vec; - PyObject *u, *v; - u = PyList_New(0); - v = PyList_New(0); - - { - OwnedRef ref(u); - vec.push_back(std::move(ref)); - ASSERT_EQ(ref.obj(), nullptr); - } - vec.emplace_back(v); - ASSERT_EQ(Py_REFCNT(u), 1); - ASSERT_EQ(Py_REFCNT(v), 1); - return Status::OK(); -} - -Status TestOwnedRefNoGILMoves() { - PyAcquireGIL lock; - lock.release(); - - { - std::vector vec; - PyObject *u, *v; - { - lock.acquire(); - u = PyList_New(0); - v = PyList_New(0); - lock.release(); - } - { - OwnedRefNoGIL ref(u); - vec.push_back(std::move(ref)); - ASSERT_EQ(ref.obj(), nullptr); - } - vec.emplace_back(v); - ASSERT_EQ(Py_REFCNT(u), 1); - ASSERT_EQ(Py_REFCNT(v), 1); - return Status::OK(); - } -} - -std::string FormatPythonException(const std::string& exc_class_name) { - std::stringstream ss; - ss << "Python exception: "; - ss << exc_class_name; - return ss.str(); -} - -Status TestCheckPyErrorStatus() { - Status st; - std::string expected_detail = ""; - - auto check_error = [](Status& st, const char* expected_message = "some error", - std::string expected_detail = "") { - st = CheckPyError(); - ASSERT_EQ(st.message(), expected_message); - ASSERT_FALSE(PyErr_Occurred()); - if (expected_detail.size() > 0) { - auto detail = st.detail(); - ASSERT_NE(detail, nullptr); - ASSERT_EQ(detail->ToString(), expected_detail); - } - return Status::OK(); - }; - - for (PyObject* exc_type : {PyExc_Exception, PyExc_SyntaxError}) { - PyErr_SetString(exc_type, "some error"); - ASSERT_OK(check_error(st)); - ASSERT_TRUE(st.IsUnknownError()); - } - - PyErr_SetString(PyExc_TypeError, "some error"); - ASSERT_OK(check_error(st, "some error", FormatPythonException("TypeError"))); - ASSERT_TRUE(st.IsTypeError()); - - PyErr_SetString(PyExc_ValueError, "some error"); - ASSERT_OK(check_error(st)); - ASSERT_TRUE(st.IsInvalid()); - - PyErr_SetString(PyExc_KeyError, "some error"); - ASSERT_OK(check_error(st, "'some error'")); - ASSERT_TRUE(st.IsKeyError()); - - for (PyObject* exc_type : {PyExc_OSError, PyExc_IOError}) { - PyErr_SetString(exc_type, "some error"); - ASSERT_OK(check_error(st)); - ASSERT_TRUE(st.IsIOError()); - } - - PyErr_SetString(PyExc_NotImplementedError, "some error"); - ASSERT_OK(check_error(st, "some error", FormatPythonException("NotImplementedError"))); - ASSERT_TRUE(st.IsNotImplemented()); - - // No override if a specific status code is given - PyErr_SetString(PyExc_TypeError, "some error"); - st = CheckPyError(StatusCode::SerializationError); - ASSERT_TRUE(st.IsSerializationError()); - ASSERT_EQ(st.message(), "some error"); - ASSERT_FALSE(PyErr_Occurred()); - - return Status::OK(); -} - -Status 
TestCheckPyErrorStatusNoGIL() { - PyAcquireGIL lock; - { - Status st; - PyErr_SetString(PyExc_ZeroDivisionError, "zzzt"); - st = ConvertPyError(); - ASSERT_FALSE(PyErr_Occurred()); - lock.release(); - ASSERT_TRUE(st.IsUnknownError()); - ASSERT_EQ(st.message(), "zzzt"); - ASSERT_EQ(st.detail()->ToString(), FormatPythonException("ZeroDivisionError")); - return Status::OK(); - } -} - -Status TestRestorePyErrorBasics() { - PyErr_SetString(PyExc_ZeroDivisionError, "zzzt"); - auto st = ConvertPyError(); - ASSERT_FALSE(PyErr_Occurred()); - ASSERT_TRUE(st.IsUnknownError()); - ASSERT_EQ(st.message(), "zzzt"); - ASSERT_EQ(st.detail()->ToString(), FormatPythonException("ZeroDivisionError")); - - RestorePyError(st); - ASSERT_TRUE(PyErr_Occurred()); - PyObject* exc_type; - PyObject* exc_value; - PyObject* exc_traceback; - PyErr_Fetch(&exc_type, &exc_value, &exc_traceback); - ASSERT_TRUE(PyErr_GivenExceptionMatches(exc_type, PyExc_ZeroDivisionError)); - std::string py_message; - ASSERT_OK(internal::PyObject_StdStringStr(exc_value, &py_message)); - ASSERT_EQ(py_message, "zzzt"); - - return Status::OK(); -} - -Status TestPyBufferInvalidInputObject() { - std::shared_ptr<Buffer> res; - PyObject* input = Py_None; - auto old_refcnt = Py_REFCNT(input); - { - Status st = PyBuffer::FromPyObject(input).status(); - ASSERT_TRUE_MSG(IsPyError(st), st.ToString()); - ASSERT_FALSE(PyErr_Occurred()); - } - ASSERT_EQ(old_refcnt, Py_REFCNT(input)); - return Status::OK(); -} - -// Because of how it is declared, the Numpy C API instance initialized -// within libarrow_python.dll may not be visible in this test under Windows -// ("unresolved external symbol arrow_ARRAY_API referenced"). -#ifndef _WIN32 -Status TestPyBufferNumpyArray() { - npy_intp dims[1] = {10}; - - OwnedRef arr_ref(PyArray_SimpleNew(1, dims, NPY_FLOAT)); - PyObject* arr = arr_ref.obj(); - ASSERT_NE(arr, nullptr); - auto old_refcnt = Py_REFCNT(arr); - auto buf = std::move(PyBuffer::FromPyObject(arr)).ValueOrDie(); - - ASSERT_TRUE(buf->is_cpu()); - ASSERT_EQ(buf->data(), PyArray_DATA(reinterpret_cast<PyArrayObject*>(arr))); - ASSERT_TRUE(buf->is_mutable()); - ASSERT_EQ(buf->mutable_data(), buf->data()); - ASSERT_EQ(old_refcnt + 1, Py_REFCNT(arr)); - buf.reset(); - ASSERT_EQ(old_refcnt, Py_REFCNT(arr)); - - // Read-only - PyArray_CLEARFLAGS(reinterpret_cast<PyArrayObject*>(arr), NPY_ARRAY_WRITEABLE); - buf = std::move(PyBuffer::FromPyObject(arr)).ValueOrDie(); - ASSERT_TRUE(buf->is_cpu()); - ASSERT_EQ(buf->data(), PyArray_DATA(reinterpret_cast<PyArrayObject*>(arr))); - ASSERT_FALSE(buf->is_mutable()); - ASSERT_EQ(old_refcnt + 1, Py_REFCNT(arr)); - buf.reset(); - ASSERT_EQ(old_refcnt, Py_REFCNT(arr)); - - return Status::OK(); -} - -Status TestNumPyBufferNumpyArray() { - npy_intp dims[1] = {10}; - - OwnedRef arr_ref(PyArray_SimpleNew(1, dims, NPY_FLOAT)); - PyObject* arr = arr_ref.obj(); - ASSERT_NE(arr, nullptr); - auto old_refcnt = Py_REFCNT(arr); - - auto buf = std::make_shared<NumPyBuffer>(arr); - ASSERT_TRUE(buf->is_cpu()); - ASSERT_EQ(buf->data(), PyArray_DATA(reinterpret_cast<PyArrayObject*>(arr))); - ASSERT_TRUE(buf->is_mutable()); - ASSERT_EQ(buf->mutable_data(), buf->data()); - ASSERT_EQ(old_refcnt + 1, Py_REFCNT(arr)); - buf.reset(); - ASSERT_EQ(old_refcnt, Py_REFCNT(arr)); - - // Read-only - PyArray_CLEARFLAGS(reinterpret_cast<PyArrayObject*>(arr), NPY_ARRAY_WRITEABLE); - buf = std::make_shared<NumPyBuffer>(arr); - ASSERT_TRUE(buf->is_cpu()); - ASSERT_EQ(buf->data(), PyArray_DATA(reinterpret_cast<PyArrayObject*>(arr))); - ASSERT_FALSE(buf->is_mutable()); - ASSERT_EQ(old_refcnt + 1, Py_REFCNT(arr)); - buf.reset(); - ASSERT_EQ(old_refcnt, Py_REFCNT(arr)); - - return
Status::OK(); -} -#endif - -Status TestPythonDecimalToString() { - OwnedRef decimal_constructor_; - OwnedRef decimal_module; - - RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module)); - RETURN_NOT_OK( - internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_)); - - std::string decimal_string("-39402950693754869342983"); - PyObject* python_object = - internal::DecimalFromString(decimal_constructor_.obj(), decimal_string); - ASSERT_NE(python_object, nullptr); - - std::string string_result; - ASSERT_OK(internal::PythonDecimalToString(python_object, &string_result)); - - return Status::OK(); -} - -Status TestInferPrecisionAndScale() { - OwnedRef decimal_constructor_; - OwnedRef decimal_module; - - RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module)); - RETURN_NOT_OK( - internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_)); - - std::string decimal_string("-394029506937548693.42983"); - PyObject* python_decimal = - internal::DecimalFromString(decimal_constructor_.obj(), decimal_string); - - internal::DecimalMetadata metadata; - ASSERT_OK(metadata.Update(python_decimal)); - - const auto expected_precision = - static_cast(decimal_string.size() - 2); // 1 for -, 1 for . - const int32_t expected_scale = 5; - - ASSERT_EQ(expected_precision, metadata.precision()); - ASSERT_EQ(expected_scale, metadata.scale()); - - return Status::OK(); -} - -Status TestInferPrecisionAndNegativeScale() { - OwnedRef decimal_constructor_; - OwnedRef decimal_module; - - RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module)); - RETURN_NOT_OK( - internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_)); - - std::string decimal_string("-3.94042983E+10"); - PyObject* python_decimal = - internal::DecimalFromString(decimal_constructor_.obj(), decimal_string); - - internal::DecimalMetadata metadata; - ASSERT_OK(metadata.Update(python_decimal)); - - const auto expected_precision = 11; - const int32_t expected_scale = 0; - - ASSERT_EQ(expected_precision, metadata.precision()); - ASSERT_EQ(expected_scale, metadata.scale()); - - return Status::OK(); -} - -Status TestInferAllLeadingZeros() { - OwnedRef decimal_constructor_; - OwnedRef decimal_module; - - RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module)); - RETURN_NOT_OK( - internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_)); - - std::string decimal_string("0.001"); - PyObject* python_decimal = - internal::DecimalFromString(decimal_constructor_.obj(), decimal_string); - - internal::DecimalMetadata metadata; - ASSERT_OK(metadata.Update(python_decimal)); - ASSERT_EQ(3, metadata.precision()); - ASSERT_EQ(3, metadata.scale()); - - return Status::OK(); -} - -Status TestInferAllLeadingZerosExponentialNotationPositive() { - OwnedRef decimal_constructor_; - OwnedRef decimal_module; - - RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module)); - RETURN_NOT_OK( - internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_)); - - std::string decimal_string("0.01E5"); - PyObject* python_decimal = - internal::DecimalFromString(decimal_constructor_.obj(), decimal_string); - - internal::DecimalMetadata metadata; - ASSERT_OK(metadata.Update(python_decimal)); - ASSERT_EQ(4, metadata.precision()); - ASSERT_EQ(0, metadata.scale()); - - return Status::OK(); -} - -Status TestInferAllLeadingZerosExponentialNotationNegative() { - OwnedRef decimal_constructor_; - OwnedRef decimal_module; - - 
RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module)); - RETURN_NOT_OK( - internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_)); - - std::string decimal_string("0.01E3"); - PyObject* python_decimal = - internal::DecimalFromString(decimal_constructor_.obj(), decimal_string); - internal::DecimalMetadata metadata; - ASSERT_OK(metadata.Update(python_decimal)); - ASSERT_EQ(2, metadata.precision()); - ASSERT_EQ(0, metadata.scale()); - - return Status::OK(); -} - -Status TestObjectBlockWriteFails() { - StringBuilder builder; - const char value[] = {'\xf1', '\0'}; - - for (int i = 0; i < 1000; ++i) { - ASSERT_OK(builder.Append(value, static_cast(strlen(value)))); - } - - std::shared_ptr arr; - ASSERT_OK(builder.Finish(&arr)); - - auto f1 = field("f1", utf8()); - auto f2 = field("f2", utf8()); - auto f3 = field("f3", utf8()); - std::vector> fields = {f1, f2, f3}; - std::vector> cols = {arr, arr, arr}; - - auto schema = ::arrow::schema(fields); - auto table = Table::Make(schema, cols); - - Status st; - Py_BEGIN_ALLOW_THREADS; - PyObject* out; - PandasOptions options; - options.use_threads = true; - st = ConvertTableToPandas(options, table, &out); - Py_END_ALLOW_THREADS; - ASSERT_RAISES(UnknownError, st); - - return Status::OK(); -} - -Status TestMixedTypeFails() { - OwnedRef list_ref(PyList_New(3)); - PyObject* list = list_ref.obj(); - - ASSERT_NE(list, nullptr); - - PyObject* str = PyUnicode_FromString("abc"); - ASSERT_NE(str, nullptr); - - PyObject* integer = PyLong_FromLong(1234L); - ASSERT_NE(integer, nullptr); - - PyObject* doub = PyFloat_FromDouble(123.0234); - ASSERT_NE(doub, nullptr); - - // This steals a reference to each object, so we don't need to decref them later - // just the list - ASSERT_EQ(PyList_SetItem(list, 0, str), 0); - ASSERT_EQ(PyList_SetItem(list, 1, integer), 0); - ASSERT_EQ(PyList_SetItem(list, 2, doub), 0); - - ASSERT_RAISES(TypeError, ConvertPySequence(list, nullptr, {})); - - return Status::OK(); -} - -template -Status DecimalTestFromPythonDecimalRescale(std::shared_ptr type, - PyObject* python_decimal, - std::optional expected) { - DecimalValue value; - const auto& decimal_type = checked_cast(*type); - - if (expected.has_value()) { - ASSERT_OK(internal::DecimalFromPythonDecimal(python_decimal, decimal_type, &value)); - ASSERT_EQ(expected.value(), value); - - ASSERT_OK(internal::DecimalFromPyObject(python_decimal, decimal_type, &value)); - ASSERT_EQ(expected.value(), value); - } else { - ASSERT_RAISES(Invalid, internal::DecimalFromPythonDecimal(python_decimal, - decimal_type, &value)); - ASSERT_RAISES(Invalid, - internal::DecimalFromPyObject(python_decimal, decimal_type, &value)); - } - return Status::OK(); -} - -Status TestFromPythonDecimalRescaleNotTruncateable() { - OwnedRef decimal_constructor_; - OwnedRef decimal_module; - - RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module)); - RETURN_NOT_OK( - internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_)); - - std::string decimal_string("1.001"); - PyObject* python_decimal = - internal::DecimalFromString(decimal_constructor_.obj(), decimal_string); - // We fail when truncating values that would lose data if cast to a decimal type with - // lower scale - ASSERT_OK(DecimalTestFromPythonDecimalRescale(::arrow::decimal128(10, 2), - python_decimal, {})); - ASSERT_OK(DecimalTestFromPythonDecimalRescale(::arrow::decimal256(10, 2), - python_decimal, {})); - - return Status::OK(); -} - -Status TestFromPythonDecimalRescaleTruncateable() { - 
OwnedRef decimal_constructor_; - OwnedRef decimal_module; - - RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module)); - RETURN_NOT_OK( - internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_)); - - std::string decimal_string("1.000"); - PyObject* python_decimal = - internal::DecimalFromString(decimal_constructor_.obj(), decimal_string); - // We allow truncation of values that do not lose precision when dividing by 10 * the - // difference between the scales, e.g., 1.000 -> 1.00 - ASSERT_OK(DecimalTestFromPythonDecimalRescale(::arrow::decimal128(10, 2), - python_decimal, 100)); - ASSERT_OK(DecimalTestFromPythonDecimalRescale(::arrow::decimal256(10, 2), - python_decimal, 100)); - - return Status::OK(); -} - -Status TestFromPythonNegativeDecimalRescale() { - OwnedRef decimal_constructor_; - OwnedRef decimal_module; - - RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module)); - RETURN_NOT_OK( - internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_)); - - std::string decimal_string("-1.000"); - PyObject* python_decimal = - internal::DecimalFromString(decimal_constructor_.obj(), decimal_string); - ASSERT_OK(DecimalTestFromPythonDecimalRescale(::arrow::decimal128(10, 9), - python_decimal, -1000000000)); - ASSERT_OK(DecimalTestFromPythonDecimalRescale(::arrow::decimal256(10, 9), - python_decimal, -1000000000)); - - return Status::OK(); -} - -Status TestDecimal128FromPythonInteger() { - Decimal128 value; - OwnedRef python_long(PyLong_FromLong(42)); - auto type = ::arrow::decimal128(10, 2); - const auto& decimal_type = checked_cast(*type); - ASSERT_OK(internal::DecimalFromPyObject(python_long.obj(), decimal_type, &value)); - ASSERT_EQ(4200, value); - return Status::OK(); -} - -Status TestDecimal256FromPythonInteger() { - Decimal256 value; - OwnedRef python_long(PyLong_FromLong(42)); - auto type = ::arrow::decimal256(10, 2); - const auto& decimal_type = checked_cast(*type); - ASSERT_OK(internal::DecimalFromPyObject(python_long.obj(), decimal_type, &value)); - ASSERT_EQ(4200, value); - return Status::OK(); -} - -Status TestDecimal128OverflowFails() { - Decimal128 value; - OwnedRef decimal_constructor_; - OwnedRef decimal_module; - - RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module)); - RETURN_NOT_OK( - internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_)); - - std::string decimal_string("9999999999999999999999999999999999999.9"); - PyObject* python_decimal = - internal::DecimalFromString(decimal_constructor_.obj(), decimal_string); - internal::DecimalMetadata metadata; - ASSERT_OK(metadata.Update(python_decimal)); - ASSERT_EQ(38, metadata.precision()); - ASSERT_EQ(1, metadata.scale()); - - auto type = ::arrow::decimal(38, 38); - const auto& decimal_type = checked_cast(*type); - ASSERT_RAISES(Invalid, - internal::DecimalFromPythonDecimal(python_decimal, decimal_type, &value)); - return Status::OK(); -} - -Status TestDecimal256OverflowFails() { - Decimal256 value; - OwnedRef decimal_constructor_; - OwnedRef decimal_module; - - RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module)); - RETURN_NOT_OK( - internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_)); - - std::string decimal_string( - "999999999999999999999999999999999999999999999999999999999999999999999999999.9"); - PyObject* python_decimal = - internal::DecimalFromString(decimal_constructor_.obj(), decimal_string); - - internal::DecimalMetadata metadata; - 
ASSERT_OK(metadata.Update(python_decimal)); - ASSERT_EQ(76, metadata.precision()); - ASSERT_EQ(1, metadata.scale()); - - auto type = ::arrow::decimal(76, 76); - const auto& decimal_type = checked_cast(*type); - ASSERT_RAISES(Invalid, - internal::DecimalFromPythonDecimal(python_decimal, decimal_type, &value)); - return Status::OK(); -} - -Status TestNoneAndNaN() { - OwnedRef list_ref(PyList_New(4)); - PyObject* list = list_ref.obj(); - - ASSERT_NE(list, nullptr); - - OwnedRef decimal_constructor_; - OwnedRef decimal_module; - RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module)); - RETURN_NOT_OK( - internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_)); - PyObject* constructor = decimal_constructor_.obj(); - PyObject* decimal_value = internal::DecimalFromString(constructor, "1.234"); - ASSERT_NE(decimal_value, nullptr); - - Py_INCREF(Py_None); - PyObject* missing_value1 = Py_None; - ASSERT_NE(missing_value1, nullptr); - - PyObject* missing_value2 = PyFloat_FromDouble(NPY_NAN); - ASSERT_NE(missing_value2, nullptr); - - PyObject* missing_value3 = internal::DecimalFromString(constructor, "nan"); - ASSERT_NE(missing_value3, nullptr); - - // This steals a reference to each object, so we don't need to decref them later, - // just the list - ASSERT_EQ(0, PyList_SetItem(list, 0, decimal_value)); - ASSERT_EQ(0, PyList_SetItem(list, 1, missing_value1)); - ASSERT_EQ(0, PyList_SetItem(list, 2, missing_value2)); - ASSERT_EQ(0, PyList_SetItem(list, 3, missing_value3)); - - PyConversionOptions options; - ASSERT_RAISES(TypeError, ConvertPySequence(list, nullptr, options)); - - options.from_pandas = true; - auto chunked = std::move(ConvertPySequence(list, nullptr, options)).ValueOrDie(); - ASSERT_EQ(chunked->num_chunks(), 1); - - auto arr = chunked->chunk(0); - ASSERT_TRUE(arr->IsValid(0)); - ASSERT_TRUE(arr->IsNull(1)); - ASSERT_TRUE(arr->IsNull(2)); - ASSERT_TRUE(arr->IsNull(3)); - - return Status::OK(); -} - -Status TestMixedPrecisionAndScale() { - std::vector strings{{"0.001", "1.01E5", "1.01E5"}}; - - OwnedRef list_ref(PyList_New(static_cast(strings.size()))); - PyObject* list = list_ref.obj(); - - ASSERT_NE(list, nullptr); - - OwnedRef decimal_constructor_; - OwnedRef decimal_module; - RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module)); - RETURN_NOT_OK( - internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_)); - // PyList_SetItem steals a reference to the item so we don't decref it later - PyObject* decimal_constructor = decimal_constructor_.obj(); - for (Py_ssize_t i = 0; i < static_cast(strings.size()); ++i) { - const int result = PyList_SetItem( - list, i, internal::DecimalFromString(decimal_constructor, strings.at(i))); - ASSERT_EQ(0, result); - } - - auto arr = std::move(ConvertPySequence(list, nullptr, {})).ValueOrDie(); - const auto& type = checked_cast(*arr->type()); - - int32_t expected_precision = 9; - int32_t expected_scale = 3; - ASSERT_EQ(expected_precision, type.precision()); - ASSERT_EQ(expected_scale, type.scale()); - - return Status::OK(); -} - -Status TestMixedPrecisionAndScaleSequenceConvert() { - OwnedRef decimal_constructor_; - OwnedRef decimal_module; - - RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module)); - RETURN_NOT_OK( - internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_)); - - std::string decimal_string_1("0.01"); - PyObject* value1 = - internal::DecimalFromString(decimal_constructor_.obj(), decimal_string_1); - ASSERT_NE(value1, nullptr); - - 
std::string decimal_string_2("0.001"); - PyObject* value2 = - internal::DecimalFromString(decimal_constructor_.obj(), decimal_string_2); - ASSERT_NE(value2, nullptr); - - OwnedRef list_ref(PyList_New(2)); - PyObject* list = list_ref.obj(); - - // This steals a reference to each object, so we don't need to decref them later - // just the list - ASSERT_EQ(PyList_SetItem(list, 0, value1), 0); - ASSERT_EQ(PyList_SetItem(list, 1, value2), 0); - - auto arr = std::move(ConvertPySequence(list, nullptr, {})).ValueOrDie(); - const auto& type = checked_cast(*arr->type()); - ASSERT_EQ(3, type.precision()); - ASSERT_EQ(3, type.scale()); - - return Status::OK(); -} - -Status TestSimpleInference() { - OwnedRef decimal_constructor_; - OwnedRef decimal_module; - - RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module)); - RETURN_NOT_OK( - internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_)); - - std::string decimal_string("0.01"); - PyObject* value = - internal::DecimalFromString(decimal_constructor_.obj(), decimal_string); - ASSERT_NE(value, nullptr); - internal::DecimalMetadata metadata; - ASSERT_OK(metadata.Update(value)); - ASSERT_EQ(2, metadata.precision()); - ASSERT_EQ(2, metadata.scale()); - - return Status::OK(); -} - -Status TestUpdateWithNaN() { - internal::DecimalMetadata metadata; - OwnedRef decimal_constructor_; - OwnedRef decimal_module; - RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module)); - RETURN_NOT_OK( - internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_)); - std::string decimal_string("nan"); - PyObject* nan_value = - internal::DecimalFromString(decimal_constructor_.obj(), decimal_string); - - ASSERT_OK(metadata.Update(nan_value)); - ASSERT_EQ(std::numeric_limits::min(), metadata.precision()); - ASSERT_EQ(std::numeric_limits::min(), metadata.scale()); - - return Status::OK(); -} - -} // namespace - -std::vector GetCppTestCases() { - return { - {"test_owned_ref_moves", TestOwnedRefMoves}, - {"test_owned_ref_nogil_moves", TestOwnedRefNoGILMoves}, - {"test_check_pyerror_status", TestCheckPyErrorStatus}, - {"test_check_pyerror_status_nogil", TestCheckPyErrorStatusNoGIL}, - {"test_restore_pyerror_basics", TestRestorePyErrorBasics}, - {"test_pybuffer_invalid_input_object", TestPyBufferInvalidInputObject}, -#ifndef _WIN32 - {"test_pybuffer_numpy_array", TestPyBufferNumpyArray}, - {"test_numpybuffer_numpy_array", TestNumPyBufferNumpyArray}, -#endif - {"test_python_decimal_to_string", TestPythonDecimalToString}, - {"test_infer_precision_and_scale", TestInferPrecisionAndScale}, - {"test_infer_precision_and_negative_scale", TestInferPrecisionAndNegativeScale}, - {"test_infer_all_leading_zeros", TestInferAllLeadingZeros}, - {"test_infer_all_leading_zeros_exponential_notation_positive", - TestInferAllLeadingZerosExponentialNotationPositive}, - {"test_infer_all_leading_zeros_exponential_notation_negative", - TestInferAllLeadingZerosExponentialNotationNegative}, - {"test_object_block_write_fails", TestObjectBlockWriteFails}, - {"test_mixed_type_fails", TestMixedTypeFails}, - {"test_from_python_decimal_rescale_not_truncateable", - TestFromPythonDecimalRescaleNotTruncateable}, - {"test_from_python_decimal_rescale_truncateable", - TestFromPythonDecimalRescaleTruncateable}, - {"test_from_python_negative_decimal_rescale", TestFromPythonNegativeDecimalRescale}, - {"test_decimal128_from_python_integer", TestDecimal128FromPythonInteger}, - {"test_decimal256_from_python_integer", TestDecimal256FromPythonInteger}, - 
{"test_decimal128_overflow_fails", TestDecimal128OverflowFails}, - {"test_decimal256_overflow_fails", TestDecimal256OverflowFails}, - {"test_none_and_nan", TestNoneAndNaN}, - {"test_mixed_precision_and_scale", TestMixedPrecisionAndScale}, - {"test_mixed_precision_and_scale_sequence_convert", - TestMixedPrecisionAndScaleSequenceConvert}, - {"test_simple_inference", TestSimpleInference}, - {"test_update_with_nan", TestUpdateWithNaN}, - }; -} - -} // namespace testing -} // namespace py -} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/python_test.h b/src/vendored/apache-arrow-12.0.1/arrow/python/python_test.h deleted file mode 100644 index c2eb62f..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/python_test.h +++ /dev/null @@ -1,42 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include -#include - -#include "arrow/status.h" - -#include "arrow/python/visibility.h" - -namespace arrow { -namespace py { -namespace testing { - -struct TestCase { - std::string name; - std::function func; -}; - -ARROW_PYTHON_EXPORT -std::vector GetCppTestCases(); - -} // namespace testing -} // namespace py -} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/python_to_arrow.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/python_to_arrow.cc deleted file mode 100644 index 486bd84..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/python_to_arrow.cc +++ /dev/null @@ -1,1240 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#include "arrow/python/python_to_arrow.h" -#include "arrow/python/numpy_interop.h" - -#include <datetime.h> - -#include <algorithm> -#include <limits> -#include <map> -#include <string> -#include <utility> -#include <vector> - -#include "arrow/array.h" -#include "arrow/array/builder_base.h" -#include "arrow/array/builder_binary.h" -#include "arrow/array/builder_decimal.h" -#include "arrow/array/builder_dict.h" -#include "arrow/array/builder_nested.h" -#include "arrow/array/builder_primitive.h" -#include "arrow/array/builder_time.h" -#include "arrow/chunked_array.h" -#include "arrow/result.h" -#include "arrow/scalar.h" -#include "arrow/status.h" -#include "arrow/type.h" -#include "arrow/type_traits.h" -#include "arrow/util/checked_cast.h" -#include "arrow/util/converter.h" -#include "arrow/util/decimal.h" -#include "arrow/util/int_util_overflow.h" -#include "arrow/util/logging.h" - -#include "arrow/python/datetime.h" -#include "arrow/python/decimal.h" -#include "arrow/python/helpers.h" -#include "arrow/python/inference.h" -#include "arrow/python/iterators.h" -#include "arrow/python/numpy_convert.h" -#include "arrow/python/type_traits.h" -#include "arrow/visit_type_inline.h" - -namespace arrow { - -using internal::checked_cast; -using internal::checked_pointer_cast; - -using internal::Converter; -using internal::DictionaryConverter; -using internal::ListConverter; -using internal::PrimitiveConverter; -using internal::StructConverter; - -using internal::MakeChunker; -using internal::MakeConverter; - -namespace py { - -namespace { -enum class MonthDayNanoField { kMonths, kWeeksAndDays, kDaysOnly, kNanoseconds }; - -template <MonthDayNanoField field> -struct MonthDayNanoTraits; - -struct MonthDayNanoAttrData { - const char* name; - const int64_t multiplier; -}; - -template <> -struct MonthDayNanoTraits<MonthDayNanoField::kMonths> { - using c_type = int32_t; - static const MonthDayNanoAttrData attrs[]; -}; - -const MonthDayNanoAttrData MonthDayNanoTraits<MonthDayNanoField::kMonths>::attrs[] = { - {"years", 1}, {"months", /*months_in_year=*/12}, {nullptr, 0}}; - -template <> -struct MonthDayNanoTraits<MonthDayNanoField::kWeeksAndDays> { - using c_type = int32_t; - static const MonthDayNanoAttrData attrs[]; -}; - -const MonthDayNanoAttrData MonthDayNanoTraits<MonthDayNanoField::kWeeksAndDays>::attrs[] = - {{"weeks", 1}, {"days", /*days_in_week=*/7}, {nullptr, 0}}; - -template <> -struct MonthDayNanoTraits<MonthDayNanoField::kDaysOnly> { - using c_type = int32_t; - static const MonthDayNanoAttrData attrs[]; -}; - -const MonthDayNanoAttrData MonthDayNanoTraits<MonthDayNanoField::kDaysOnly>::attrs[] = { - {"days", 1}, {nullptr, 0}}; - -template <> -struct MonthDayNanoTraits<MonthDayNanoField::kNanoseconds> { - using c_type = int64_t; - static const MonthDayNanoAttrData attrs[]; -}; - -const MonthDayNanoAttrData MonthDayNanoTraits<MonthDayNanoField::kNanoseconds>::attrs[] = - {{"hours", 1}, - {"minutes", /*minutes_in_hours=*/60}, - {"seconds", /*seconds_in_minute=*/60}, - {"milliseconds", /*milliseconds_in_seconds*/ 1000}, - {"microseconds", /*microseconds_in_millseconds=*/1000}, - {"nanoseconds", /*nanoseconds_in_microseconds=*/1000}, - {nullptr, 0}}; - -template <MonthDayNanoField field> -struct PopulateMonthDayNano { - using Traits = MonthDayNanoTraits<field>; - using field_c_type = typename Traits::c_type; - - static Status Field(PyObject* obj, field_c_type* out, bool* found_attrs) { - *out = 0; - for (const MonthDayNanoAttrData* attr = &Traits::attrs[0]; attr->multiplier != 0; - ++attr) { - if (attr->multiplier != 1 && - ::arrow::internal::MultiplyWithOverflow( - static_cast<field_c_type>(attr->multiplier), *out, out)) { - return Status::Invalid("Overflow on: ", (attr - 1)->name, - " for: ", internal::PyObject_StdStringRepr(obj)); - } - - OwnedRef field_value(PyObject_GetAttrString(obj, attr->name)); - if (field_value.obj() == nullptr) { - // No attribute present, skip to the
next one. - PyErr_Clear(); - continue; - } - RETURN_IF_PYERROR(); - *found_attrs = true; - field_c_type value; - RETURN_NOT_OK(internal::CIntFromPython(field_value.obj(), &value, attr->name)); - if (::arrow::internal::AddWithOverflow(*out, value, out)) { - return Status::Invalid("Overflow on: ", attr->name, - " for: ", internal::PyObject_StdStringRepr(obj)); - } - } - - return Status::OK(); - } -}; - -// Utility for converting single python objects to their intermediate C representations -// which can be fed to the typed builders -class PyValue { - public: - // Type aliases for shorter signature definitions - using I = PyObject*; - using O = PyConversionOptions; - - // Used for null checking before actually converting the values - static bool IsNull(const O& options, I obj) { - if (options.from_pandas) { - return internal::PandasObjectIsNull(obj); - } else { - return obj == Py_None; - } - } - - // Used for post-conversion numpy NaT sentinel checking - static bool IsNaT(const TimestampType*, int64_t value) { - return internal::npy_traits::isnull(value); - } - - // Used for post-conversion numpy NaT sentinel checking - static bool IsNaT(const DurationType*, int64_t value) { - return internal::npy_traits::isnull(value); - } - - static Result Convert(const NullType*, const O&, I obj) { - if (obj == Py_None) { - return nullptr; - } else { - return Status::Invalid("Invalid null value"); - } - } - - static Result Convert(const BooleanType*, const O&, I obj) { - if (obj == Py_True) { - return true; - } else if (obj == Py_False) { - return false; - } else if (PyArray_IsScalar(obj, Bool)) { - return reinterpret_cast(obj)->obval == NPY_TRUE; - } else { - return internal::InvalidValue(obj, "tried to convert to boolean"); - } - } - - template - static enable_if_integer> Convert(const T* type, const O&, - I obj) { - typename T::c_type value; - auto status = internal::CIntFromPython(obj, &value); - if (ARROW_PREDICT_TRUE(status.ok())) { - return value; - } else if (!internal::PyIntScalar_Check(obj)) { - std::stringstream ss; - ss << "tried to convert to " << type->ToString(); - return internal::InvalidValue(obj, ss.str()); - } else { - return status; - } - } - - static Result Convert(const HalfFloatType*, const O&, I obj) { - uint16_t value; - RETURN_NOT_OK(PyFloat_AsHalf(obj, &value)); - return value; - } - - static Result Convert(const FloatType*, const O&, I obj) { - float value; - if (internal::PyFloatScalar_Check(obj)) { - value = static_cast(PyFloat_AsDouble(obj)); - RETURN_IF_PYERROR(); - } else if (internal::PyIntScalar_Check(obj)) { - RETURN_NOT_OK(internal::IntegerScalarToFloat32Safe(obj, &value)); - } else { - return internal::InvalidValue(obj, "tried to convert to float32"); - } - return value; - } - - static Result Convert(const DoubleType*, const O&, I obj) { - double value; - if (PyFloat_Check(obj)) { - value = PyFloat_AS_DOUBLE(obj); - } else if (internal::PyFloatScalar_Check(obj)) { - // Other kinds of float-y things - value = PyFloat_AsDouble(obj); - RETURN_IF_PYERROR(); - } else if (internal::PyIntScalar_Check(obj)) { - RETURN_NOT_OK(internal::IntegerScalarToDoubleSafe(obj, &value)); - } else { - return internal::InvalidValue(obj, "tried to convert to double"); - } - return value; - } - - static Result Convert(const Decimal128Type* type, const O&, I obj) { - Decimal128 value; - RETURN_NOT_OK(internal::DecimalFromPyObject(obj, *type, &value)); - return value; - } - - static Result Convert(const Decimal256Type* type, const O&, I obj) { - Decimal256 value; - 
RETURN_NOT_OK(internal::DecimalFromPyObject(obj, *type, &value)); - return value; - } - - static Result Convert(const Date32Type*, const O&, I obj) { - int32_t value; - if (PyDate_Check(obj)) { - auto pydate = reinterpret_cast(obj); - value = static_cast(internal::PyDate_to_days(pydate)); - } else { - RETURN_NOT_OK( - internal::CIntFromPython(obj, &value, "Integer too large for date32")); - } - return value; - } - - static Result Convert(const Date64Type*, const O&, I obj) { - int64_t value; - if (PyDateTime_Check(obj)) { - auto pydate = reinterpret_cast(obj); - value = internal::PyDateTime_to_ms(pydate); - // Truncate any intraday milliseconds - // TODO: introduce an option for this - value -= value % 86400000LL; - } else if (PyDate_Check(obj)) { - auto pydate = reinterpret_cast(obj); - value = internal::PyDate_to_ms(pydate); - } else { - RETURN_NOT_OK( - internal::CIntFromPython(obj, &value, "Integer too large for date64")); - } - return value; - } - - static Result Convert(const Time32Type* type, const O&, I obj) { - int32_t value; - if (PyTime_Check(obj)) { - switch (type->unit()) { - case TimeUnit::SECOND: - value = static_cast(internal::PyTime_to_s(obj)); - break; - case TimeUnit::MILLI: - value = static_cast(internal::PyTime_to_ms(obj)); - break; - default: - return Status::UnknownError("Invalid time unit"); - } - } else { - RETURN_NOT_OK(internal::CIntFromPython(obj, &value, "Integer too large for int32")); - } - return value; - } - - static Result Convert(const Time64Type* type, const O&, I obj) { - int64_t value; - if (PyTime_Check(obj)) { - switch (type->unit()) { - case TimeUnit::MICRO: - value = internal::PyTime_to_us(obj); - break; - case TimeUnit::NANO: - value = internal::PyTime_to_ns(obj); - break; - default: - return Status::UnknownError("Invalid time unit"); - } - } else { - RETURN_NOT_OK(internal::CIntFromPython(obj, &value, "Integer too large for int64")); - } - return value; - } - - static Result Convert(const TimestampType* type, const O& options, I obj) { - int64_t value, offset; - if (PyDateTime_Check(obj)) { - if (ARROW_PREDICT_FALSE(options.ignore_timezone)) { - offset = 0; - } else { - ARROW_ASSIGN_OR_RAISE(offset, internal::PyDateTime_utcoffset_s(obj)); - } - auto dt = reinterpret_cast(obj); - switch (type->unit()) { - case TimeUnit::SECOND: - value = internal::PyDateTime_to_s(dt) - offset; - break; - case TimeUnit::MILLI: - value = internal::PyDateTime_to_ms(dt) - offset * 1000LL; - break; - case TimeUnit::MICRO: - value = internal::PyDateTime_to_us(dt) - offset * 1000000LL; - break; - case TimeUnit::NANO: - if (internal::IsPandasTimestamp(obj)) { - // pd.Timestamp value attribute contains the offset from unix epoch - // so no adjustment for timezone is need. 
- OwnedRef nanos(PyObject_GetAttrString(obj, "value")); - RETURN_IF_PYERROR(); - RETURN_NOT_OK(internal::CIntFromPython(nanos.obj(), &value)); - } else { - // Conversion to nanoseconds can overflow -> check multiply of microseconds - value = internal::PyDateTime_to_us(dt); - if (arrow::internal::MultiplyWithOverflow(value, 1000LL, &value)) { - return internal::InvalidValue(obj, - "out of bounds for nanosecond resolution"); - } - - // Adjust with offset and check for overflow - if (arrow::internal::SubtractWithOverflow(value, offset * 1000000000LL, - &value)) { - return internal::InvalidValue(obj, - "out of bounds for nanosecond resolution"); - } - } - break; - default: - return Status::UnknownError("Invalid time unit"); - } - } else if (PyArray_CheckAnyScalarExact(obj)) { - // validate that the numpy scalar has np.datetime64 dtype - std::shared_ptr numpy_type; - RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &numpy_type)); - if (!numpy_type->Equals(*type)) { - return Status::NotImplemented("Expected np.datetime64 but got: ", - numpy_type->ToString()); - } - return reinterpret_cast(obj)->obval; - } else { - RETURN_NOT_OK(internal::CIntFromPython(obj, &value)); - } - return value; - } - - static Result Convert( - const MonthDayNanoIntervalType* /*type*/, const O& /*options*/, I obj) { - MonthDayNanoIntervalType::MonthDayNanos output; - bool found_attrs = false; - RETURN_NOT_OK(PopulateMonthDayNano::Field( - obj, &output.months, &found_attrs)); - // on relativeoffset weeks is a property calculated from days. On - // DateOffset is is a field on its own. timedelta doesn't have a weeks - // attribute. - PyObject* pandas_date_offset_type = internal::BorrowPandasDataOffsetType(); - bool is_date_offset = pandas_date_offset_type == (PyObject*)Py_TYPE(obj); - if (!is_date_offset) { - RETURN_NOT_OK(PopulateMonthDayNano::Field( - obj, &output.days, &found_attrs)); - } else { - RETURN_NOT_OK(PopulateMonthDayNano::Field( - obj, &output.days, &found_attrs)); - } - RETURN_NOT_OK(PopulateMonthDayNano::Field( - obj, &output.nanoseconds, &found_attrs)); - - // date_offset can have zero fields. 
- if (found_attrs || is_date_offset) { - return output; - } - if (PyTuple_Check(obj) && PyTuple_Size(obj) == 3) { - RETURN_NOT_OK(internal::CIntFromPython(PyTuple_GET_ITEM(obj, 0), &output.months, - "Months (tuple item #0) too large")); - RETURN_NOT_OK(internal::CIntFromPython(PyTuple_GET_ITEM(obj, 1), &output.days, - "Days (tuple item #1) too large")); - RETURN_NOT_OK(internal::CIntFromPython(PyTuple_GET_ITEM(obj, 2), - &output.nanoseconds, - "Nanoseconds (tuple item #2) too large")); - return output; - } - return Status::TypeError("No temporal attributes found on object."); - } - - static Result Convert(const DurationType* type, const O&, I obj) { - int64_t value; - if (PyDelta_Check(obj)) { - auto dt = reinterpret_cast(obj); - switch (type->unit()) { - case TimeUnit::SECOND: - value = internal::PyDelta_to_s(dt); - break; - case TimeUnit::MILLI: - value = internal::PyDelta_to_ms(dt); - break; - case TimeUnit::MICRO: { - ARROW_ASSIGN_OR_RAISE(value, internal::PyDelta_to_us(dt)); - break; - } - case TimeUnit::NANO: - if (internal::IsPandasTimedelta(obj)) { - OwnedRef nanos(PyObject_GetAttrString(obj, "value")); - RETURN_IF_PYERROR(); - RETURN_NOT_OK(internal::CIntFromPython(nanos.obj(), &value)); - } else { - ARROW_ASSIGN_OR_RAISE(value, internal::PyDelta_to_ns(dt)); - } - break; - default: - return Status::UnknownError("Invalid time unit"); - } - } else if (PyArray_CheckAnyScalarExact(obj)) { - // validate that the numpy scalar has np.datetime64 dtype - std::shared_ptr numpy_type; - RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &numpy_type)); - if (!numpy_type->Equals(*type)) { - return Status::NotImplemented("Expected np.timedelta64 but got: ", - numpy_type->ToString()); - } - return reinterpret_cast(obj)->obval; - } else { - RETURN_NOT_OK(internal::CIntFromPython(obj, &value)); - } - return value; - } - - // The binary-like intermediate representation is PyBytesView because it keeps temporary - // python objects alive (non-contiguous memoryview) and stores whether the original - // object was unicode encoded or not, which is used for unicode -> bytes coersion if - // there is a non-unicode object observed. - - static Status Convert(const BaseBinaryType*, const O&, I obj, PyBytesView& view) { - return view.ParseString(obj); - } - - static Status Convert(const FixedSizeBinaryType* type, const O&, I obj, - PyBytesView& view) { - ARROW_RETURN_NOT_OK(view.ParseString(obj)); - if (view.size != type->byte_width()) { - std::stringstream ss; - ss << "expected to be length " << type->byte_width() << " was " << view.size; - return internal::InvalidValue(obj, ss.str()); - } else { - return Status::OK(); - } - } - - template - static enable_if_string Convert(const T*, const O& options, I obj, - PyBytesView& view) { - if (options.strict) { - // Strict conversion, force output to be unicode / utf8 and validate that - // any binary values are utf8 - ARROW_RETURN_NOT_OK(view.ParseString(obj, true)); - if (!view.is_utf8) { - return internal::InvalidValue(obj, "was not a utf8 string"); - } - return Status::OK(); - } else { - // Non-strict conversion; keep track of whether values are unicode or bytes - return view.ParseString(obj); - } - } - - static Result Convert(const DataType* type, const O&, I obj) { - return Status::NotImplemented("PyValue::Convert is not implemented for type ", type); - } -}; - -// The base Converter class is a mixin with predefined behavior and constructors. 
-class PyConverter : public Converter<PyObject*, PyConversionOptions> { - public: - // Iterate over the input values and defer the conversion to the Append method - Status Extend(PyObject* values, int64_t size, int64_t offset = 0) override { - DCHECK_GE(size, offset); - /// Ensure we've allocated enough space - RETURN_NOT_OK(this->Reserve(size - offset)); - // Iterate over the items adding each one - return internal::VisitSequence( - values, offset, - [this](PyObject* item, bool* /* unused */) { return this->Append(item); }); - } - - // Convert and append a sequence of values masked with a numpy array - Status ExtendMasked(PyObject* values, PyObject* mask, int64_t size, - int64_t offset = 0) override { - DCHECK_GE(size, offset); - /// Ensure we've allocated enough space - RETURN_NOT_OK(this->Reserve(size - offset)); - // Iterate over the items adding each one - return internal::VisitSequenceMasked( - values, mask, offset, [this](PyObject* item, bool is_masked, bool* /* unused */) { - if (is_masked) { - return this->AppendNull(); - } else { - // This will also apply the null-checking convention in the event - // that the value is not masked - return this->Append(item); // perhaps use AppendValue instead? - } - }); - } -}; - -template <typename T, typename Enable = void> -class PyPrimitiveConverter; - -template <typename T> -class PyListConverter; - -template <typename U, typename Enable = void> -class PyDictionaryConverter; - -class PyStructConverter; - -template <typename T, typename Enable = void> -struct PyConverterTrait; - -template <typename T> -struct PyConverterTrait< - T, enable_if_t<(!is_nested_type<T>::value && !is_interval_type<T>::value && - !is_extension_type<T>::value) || - std::is_same<MonthDayNanoIntervalType, T>::value>> { - using type = PyPrimitiveConverter<T>; -}; - -template <typename T> -struct PyConverterTrait<T, enable_if_list_like<T>> { - using type = PyListConverter<T>; -}; - -template <> -struct PyConverterTrait<StructType> { - using type = PyStructConverter; -}; - -template <> -struct PyConverterTrait<DictionaryType> { - template <typename T> - using dictionary_type = PyDictionaryConverter<T>; -}; - -template <typename T> -class PyPrimitiveConverter<T, enable_if_null<T>> - : public PrimitiveConverter<T, PyConverter> { - public: - Status Append(PyObject* value) override { - if (PyValue::IsNull(this->options_, value)) { - return this->primitive_builder_->AppendNull(); - } else if (arrow::py::is_scalar(value)) { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> scalar, - arrow::py::unwrap_scalar(value)); - if (scalar->is_valid) { - return Status::Invalid("Cannot append scalar of type ", scalar->type->ToString(), - " to builder for type null"); - } else { - return this->primitive_builder_->AppendNull(); - } - } else { - ARROW_ASSIGN_OR_RAISE( - auto converted, PyValue::Convert(this->primitive_type_, this->options_, value)); - return this->primitive_builder_->Append(converted); - } - } -}; - -template <typename T> -class PyPrimitiveConverter< - T, enable_if_t<is_boolean_type<T>::value || is_number_type<T>::value || - is_decimal_type<T>::value || is_date_type<T>::value || - is_time_type<T>::value || - std::is_same<MonthDayNanoIntervalType, T>::value>> - : public PrimitiveConverter<T, PyConverter> { - public: - Status Append(PyObject* value) override { - // Since the required space has been already allocated in the Extend functions we can - // rely on the Unsafe builder API which improves the performance.
- if (PyValue::IsNull(this->options_, value)) { - this->primitive_builder_->UnsafeAppendNull(); - } else if (arrow::py::is_scalar(value)) { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr scalar, - arrow::py::unwrap_scalar(value)); - ARROW_RETURN_NOT_OK(this->primitive_builder_->AppendScalar(*scalar)); - } else { - ARROW_ASSIGN_OR_RAISE( - auto converted, PyValue::Convert(this->primitive_type_, this->options_, value)); - this->primitive_builder_->UnsafeAppend(converted); - } - return Status::OK(); - } -}; - -template -class PyPrimitiveConverter< - T, enable_if_t::value || is_duration_type::value>> - : public PrimitiveConverter { - public: - Status Append(PyObject* value) override { - if (PyValue::IsNull(this->options_, value)) { - this->primitive_builder_->UnsafeAppendNull(); - } else if (arrow::py::is_scalar(value)) { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr scalar, - arrow::py::unwrap_scalar(value)); - ARROW_RETURN_NOT_OK(this->primitive_builder_->AppendScalar(*scalar)); - } else { - ARROW_ASSIGN_OR_RAISE( - auto converted, PyValue::Convert(this->primitive_type_, this->options_, value)); - // Numpy NaT sentinels can be checked after the conversion - if (PyArray_CheckAnyScalarExact(value) && - PyValue::IsNaT(this->primitive_type_, converted)) { - this->primitive_builder_->UnsafeAppendNull(); - } else { - this->primitive_builder_->UnsafeAppend(converted); - } - } - return Status::OK(); - } -}; - -template -class PyPrimitiveConverter::value>> - : public PrimitiveConverter { - public: - Status Append(PyObject* value) override { - if (PyValue::IsNull(this->options_, value)) { - this->primitive_builder_->UnsafeAppendNull(); - } else if (arrow::py::is_scalar(value)) { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr scalar, - arrow::py::unwrap_scalar(value)); - ARROW_RETURN_NOT_OK(this->primitive_builder_->AppendScalar(*scalar)); - } else { - ARROW_RETURN_NOT_OK( - PyValue::Convert(this->primitive_type_, this->options_, value, view_)); - ARROW_RETURN_NOT_OK(this->primitive_builder_->ReserveData(view_.size)); - this->primitive_builder_->UnsafeAppend(view_.bytes); - } - return Status::OK(); - } - - protected: - PyBytesView view_; -}; - -template -class PyPrimitiveConverter> - : public PrimitiveConverter { - public: - using OffsetType = typename T::offset_type; - - Status Append(PyObject* value) override { - if (PyValue::IsNull(this->options_, value)) { - this->primitive_builder_->UnsafeAppendNull(); - } else if (arrow::py::is_scalar(value)) { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr scalar, - arrow::py::unwrap_scalar(value)); - ARROW_RETURN_NOT_OK(this->primitive_builder_->AppendScalar(*scalar)); - } else { - ARROW_RETURN_NOT_OK( - PyValue::Convert(this->primitive_type_, this->options_, value, view_)); - if (!view_.is_utf8) { - // observed binary value - observed_binary_ = true; - } - // Since we don't know the varying length input size in advance, we need to - // reserve space in the value builder one by one. ReserveData raises CapacityError - // if the value would not fit into the array. 
- ARROW_RETURN_NOT_OK(this->primitive_builder_->ReserveData(view_.size)); - this->primitive_builder_->UnsafeAppend(view_.bytes, - static_cast(view_.size)); - } - return Status::OK(); - } - - Result> ToArray() override { - ARROW_ASSIGN_OR_RAISE(auto array, (PrimitiveConverter::ToArray())); - if (observed_binary_) { - // if we saw any non-unicode, cast results to BinaryArray - auto binary_type = TypeTraits::type_singleton(); - return array->View(binary_type); - } else { - return array; - } - } - - protected: - PyBytesView view_; - bool observed_binary_ = false; -}; - -template -class PyDictionaryConverter> - : public DictionaryConverter { - public: - Status Append(PyObject* value) override { - if (PyValue::IsNull(this->options_, value)) { - return this->value_builder_->AppendNull(); - } else if (arrow::py::is_scalar(value)) { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr scalar, - arrow::py::unwrap_scalar(value)); - return this->value_builder_->AppendScalar(*scalar, 1); - } else { - ARROW_ASSIGN_OR_RAISE(auto converted, - PyValue::Convert(this->value_type_, this->options_, value)); - return this->value_builder_->Append(converted); - } - } -}; - -template -class PyDictionaryConverter> - : public DictionaryConverter { - public: - Status Append(PyObject* value) override { - if (PyValue::IsNull(this->options_, value)) { - return this->value_builder_->AppendNull(); - } else if (arrow::py::is_scalar(value)) { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr scalar, - arrow::py::unwrap_scalar(value)); - return this->value_builder_->AppendScalar(*scalar, 1); - } else { - ARROW_RETURN_NOT_OK( - PyValue::Convert(this->value_type_, this->options_, value, view_)); - return this->value_builder_->Append(view_.bytes, static_cast(view_.size)); - } - } - - protected: - PyBytesView view_; -}; - -template -class PyListConverter : public ListConverter { - public: - Status Append(PyObject* value) override { - if (PyValue::IsNull(this->options_, value)) { - return this->list_builder_->AppendNull(); - } - - RETURN_NOT_OK(this->list_builder_->Append()); - if (PyArray_Check(value)) { - RETURN_NOT_OK(AppendNdarray(value)); - } else if (PySequence_Check(value)) { - RETURN_NOT_OK(AppendSequence(value)); - } else if (PySet_Check(value) || (Py_TYPE(value) == &PyDictValues_Type)) { - RETURN_NOT_OK(AppendIterable(value)); - } else if (PyDict_Check(value) && this->type()->id() == Type::MAP) { - // Branch to support Python Dict with `map` DataType. 
- auto items = PyDict_Items(value); - OwnedRef item_ref(items); - RETURN_NOT_OK(AppendSequence(items)); - } else { - return internal::InvalidType( - value, "was not a sequence or recognized null for conversion to list type"); - } - - return ValidateBuilder(this->list_type_); - } - - protected: - Status ValidateBuilder(const MapType*) { - if (this->list_builder_->key_builder()->null_count() > 0) { - return Status::Invalid("Invalid Map: key field can not contain null values"); - } else { - return Status::OK(); - } - } - - Status ValidateBuilder(const BaseListType*) { return Status::OK(); } - - Status AppendSequence(PyObject* value) { - int64_t size = static_cast(PySequence_Size(value)); - RETURN_NOT_OK(this->list_builder_->ValidateOverflow(size)); - return this->value_converter_->Extend(value, size); - } - - Status AppendIterable(PyObject* value) { - PyObject* iterator = PyObject_GetIter(value); - OwnedRef iter_ref(iterator); - while (PyObject* item = PyIter_Next(iterator)) { - OwnedRef item_ref(item); - RETURN_NOT_OK(this->value_converter_->Reserve(1)); - RETURN_NOT_OK(this->value_converter_->Append(item)); - } - return Status::OK(); - } - - Status AppendNdarray(PyObject* value) { - PyArrayObject* ndarray = reinterpret_cast(value); - if (PyArray_NDIM(ndarray) != 1) { - return Status::Invalid("Can only convert 1-dimensional array values"); - } - const int64_t size = PyArray_SIZE(ndarray); - RETURN_NOT_OK(this->list_builder_->ValidateOverflow(size)); - - const auto value_type = this->value_converter_->builder()->type(); - switch (value_type->id()) { -// If the value type does not match the expected NumPy dtype, then fall through -// to a slower PySequence-based path -#define LIST_FAST_CASE(TYPE_ID, TYPE, NUMPY_TYPE) \ - case Type::TYPE_ID: { \ - if (PyArray_DESCR(ndarray)->type_num != NUMPY_TYPE) { \ - return this->value_converter_->Extend(value, size); \ - } \ - return AppendNdarrayTyped(ndarray); \ - } - LIST_FAST_CASE(BOOL, BooleanType, NPY_BOOL) - LIST_FAST_CASE(UINT8, UInt8Type, NPY_UINT8) - LIST_FAST_CASE(INT8, Int8Type, NPY_INT8) - LIST_FAST_CASE(UINT16, UInt16Type, NPY_UINT16) - LIST_FAST_CASE(INT16, Int16Type, NPY_INT16) - LIST_FAST_CASE(UINT32, UInt32Type, NPY_UINT32) - LIST_FAST_CASE(INT32, Int32Type, NPY_INT32) - LIST_FAST_CASE(UINT64, UInt64Type, NPY_UINT64) - LIST_FAST_CASE(INT64, Int64Type, NPY_INT64) - LIST_FAST_CASE(HALF_FLOAT, HalfFloatType, NPY_FLOAT16) - LIST_FAST_CASE(FLOAT, FloatType, NPY_FLOAT) - LIST_FAST_CASE(DOUBLE, DoubleType, NPY_DOUBLE) - LIST_FAST_CASE(TIMESTAMP, TimestampType, NPY_DATETIME) - LIST_FAST_CASE(DURATION, DurationType, NPY_TIMEDELTA) -#undef LIST_FAST_CASE - default: { - return this->value_converter_->Extend(value, size); - } - } - } - - template - Status AppendNdarrayTyped(PyArrayObject* ndarray) { - // no need to go through the conversion - using NumpyTrait = internal::npy_traits; - using NumpyType = typename NumpyTrait::value_type; - using ValueBuilderType = typename TypeTraits::BuilderType; - - const bool null_sentinels_possible = - // Always treat Numpy's NaT as null - NUMPY_TYPE == NPY_DATETIME || NUMPY_TYPE == NPY_TIMEDELTA || - // Observing pandas's null sentinels - (this->options_.from_pandas && NumpyTrait::supports_nulls); - - auto value_builder = - checked_cast(this->value_converter_->builder().get()); - - Ndarray1DIndexer values(ndarray); - if (null_sentinels_possible) { - for (int64_t i = 0; i < values.size(); ++i) { - if (NumpyTrait::isnull(values[i])) { - RETURN_NOT_OK(value_builder->AppendNull()); - } else { - 
RETURN_NOT_OK(value_builder->Append(values[i])); - } - } - } else if (!values.is_strided()) { - RETURN_NOT_OK(value_builder->AppendValues(values.data(), values.size())); - } else { - for (int64_t i = 0; i < values.size(); ++i) { - RETURN_NOT_OK(value_builder->Append(values[i])); - } - } - return Status::OK(); - } -}; - -class PyStructConverter : public StructConverter { - public: - Status Append(PyObject* value) override { - if (PyValue::IsNull(this->options_, value)) { - return this->struct_builder_->AppendNull(); - } else if (arrow::py::is_scalar(value)) { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr scalar, - arrow::py::unwrap_scalar(value)); - return this->struct_builder_->AppendScalar(*scalar); - } - switch (input_kind_) { - case InputKind::DICT: - RETURN_NOT_OK(this->struct_builder_->Append()); - return AppendDict(value); - case InputKind::TUPLE: - RETURN_NOT_OK(this->struct_builder_->Append()); - return AppendTuple(value); - case InputKind::ITEMS: - RETURN_NOT_OK(this->struct_builder_->Append()); - return AppendItems(value); - default: - RETURN_NOT_OK(InferInputKind(value)); - return Append(value); - } - } - - protected: - Status Init(MemoryPool* pool) override { - RETURN_NOT_OK((StructConverter::Init(pool))); - - // Store the field names as a PyObjects for dict matching - num_fields_ = this->struct_type_->num_fields(); - bytes_field_names_.reset(PyList_New(num_fields_)); - unicode_field_names_.reset(PyList_New(num_fields_)); - RETURN_IF_PYERROR(); - - for (int i = 0; i < num_fields_; i++) { - const auto& field_name = this->struct_type_->field(i)->name(); - PyObject* bytes = PyBytes_FromStringAndSize(field_name.c_str(), field_name.size()); - PyObject* unicode = - PyUnicode_FromStringAndSize(field_name.c_str(), field_name.size()); - RETURN_IF_PYERROR(); - PyList_SET_ITEM(bytes_field_names_.obj(), i, bytes); - PyList_SET_ITEM(unicode_field_names_.obj(), i, unicode); - } - return Status::OK(); - } - - Status InferInputKind(PyObject* value) { - // Infer input object's type, note that heterogeneous sequences are not allowed - if (PyDict_Check(value)) { - input_kind_ = InputKind::DICT; - } else if (PyTuple_Check(value)) { - input_kind_ = InputKind::TUPLE; - } else if (PySequence_Check(value)) { - input_kind_ = InputKind::ITEMS; - } else { - return internal::InvalidType(value, - "was not a dict, tuple, or recognized null value " - "for conversion to struct type"); - } - return Status::OK(); - } - - Status InferKeyKind(PyObject* items) { - for (int i = 0; i < PySequence_Length(items); i++) { - // retrieve the key from the passed key-value pairs - ARROW_ASSIGN_OR_RAISE(auto pair, GetKeyValuePair(items, i)); - - // check key exists between the unicode field names - bool do_contain = PySequence_Contains(unicode_field_names_.obj(), pair.first); - RETURN_IF_PYERROR(); - if (do_contain) { - key_kind_ = KeyKind::UNICODE; - return Status::OK(); - } - - // check key exists between the bytes field names - do_contain = PySequence_Contains(bytes_field_names_.obj(), pair.first); - RETURN_IF_PYERROR(); - if (do_contain) { - key_kind_ = KeyKind::BYTES; - return Status::OK(); - } - } - return Status::OK(); - } - - Status AppendEmpty() { - for (int i = 0; i < num_fields_; i++) { - RETURN_NOT_OK(this->children_[i]->Append(Py_None)); - } - return Status::OK(); - } - - Status AppendTuple(PyObject* tuple) { - if (!PyTuple_Check(tuple)) { - return internal::InvalidType(tuple, "was expecting a tuple"); - } - if (PyTuple_GET_SIZE(tuple) != num_fields_) { - return Status::Invalid("Tuple size must be equal to number of 
struct fields"); - } - for (int i = 0; i < num_fields_; i++) { - PyObject* value = PyTuple_GET_ITEM(tuple, i); - RETURN_NOT_OK(this->children_[i]->Append(value)); - } - return Status::OK(); - } - - Status AppendDict(PyObject* dict) { - if (!PyDict_Check(dict)) { - return internal::InvalidType(dict, "was expecting a dict"); - } - switch (key_kind_) { - case KeyKind::UNICODE: - return AppendDict(dict, unicode_field_names_.obj()); - case KeyKind::BYTES: - return AppendDict(dict, bytes_field_names_.obj()); - default: - RETURN_NOT_OK(InferKeyKind(PyDict_Items(dict))); - if (key_kind_ == KeyKind::UNKNOWN) { - // was unable to infer the type which means that all keys are absent - return AppendEmpty(); - } else { - return AppendDict(dict); - } - } - } - - Status AppendItems(PyObject* items) { - if (!PySequence_Check(items)) { - return internal::InvalidType(items, "was expecting a sequence of key-value items"); - } - switch (key_kind_) { - case KeyKind::UNICODE: - return AppendItems(items, unicode_field_names_.obj()); - case KeyKind::BYTES: - return AppendItems(items, bytes_field_names_.obj()); - default: - RETURN_NOT_OK(InferKeyKind(items)); - if (key_kind_ == KeyKind::UNKNOWN) { - // was unable to infer the type which means that all keys are absent - return AppendEmpty(); - } else { - return AppendItems(items); - } - } - } - - Status AppendDict(PyObject* dict, PyObject* field_names) { - // NOTE we're ignoring any extraneous dict items - for (int i = 0; i < num_fields_; i++) { - PyObject* name = PyList_GET_ITEM(field_names, i); // borrowed - PyObject* value = PyDict_GetItem(dict, name); // borrowed - if (value == NULL) { - RETURN_IF_PYERROR(); - } - RETURN_NOT_OK(this->children_[i]->Append(value ? value : Py_None)); - } - return Status::OK(); - } - - Result> GetKeyValuePair(PyObject* seq, int index) { - PyObject* pair = PySequence_GetItem(seq, index); - RETURN_IF_PYERROR(); - if (!PyTuple_Check(pair) || PyTuple_Size(pair) != 2) { - return internal::InvalidType(pair, "was expecting tuple of (key, value) pair"); - } - PyObject* key = PyTuple_GetItem(pair, 0); - RETURN_IF_PYERROR(); - PyObject* value = PyTuple_GetItem(pair, 1); - RETURN_IF_PYERROR(); - return std::make_pair(key, value); - } - - Status AppendItems(PyObject* items, PyObject* field_names) { - auto length = static_cast(PySequence_Size(items)); - RETURN_IF_PYERROR(); - - // append the values for the defined fields - for (int i = 0; i < std::min(num_fields_, length); i++) { - // retrieve the key-value pair - ARROW_ASSIGN_OR_RAISE(auto pair, GetKeyValuePair(items, i)); - - // validate that the key and the field name are equal - PyObject* name = PyList_GET_ITEM(field_names, i); - bool are_equal = PyObject_RichCompareBool(pair.first, name, Py_EQ); - RETURN_IF_PYERROR(); - - // finally append to the respective child builder - if (are_equal) { - RETURN_NOT_OK(this->children_[i]->Append(pair.second)); - } else { - ARROW_ASSIGN_OR_RAISE(auto key_view, PyBytesView::FromString(pair.first)); - ARROW_ASSIGN_OR_RAISE(auto name_view, PyBytesView::FromString(name)); - return Status::Invalid("The expected field name is `", name_view.bytes, "` but `", - key_view.bytes, "` was given"); - } - } - // insert null values for missing fields - for (int i = length; i < num_fields_; i++) { - RETURN_NOT_OK(this->children_[i]->AppendNull()); - } - return Status::OK(); - } - - // Whether we're converting from a sequence of dicts or tuples or list of pairs - enum class InputKind { UNKNOWN, DICT, TUPLE, ITEMS } input_kind_ = InputKind::UNKNOWN; - // Whether the input 
dictionary keys' type is python bytes or unicode - enum class KeyKind { UNKNOWN, BYTES, UNICODE } key_kind_ = KeyKind::UNKNOWN; - // Store the field names as a PyObjects for dict matching - OwnedRef bytes_field_names_; - OwnedRef unicode_field_names_; - // Store the number of fields for later reuse - int num_fields_; -}; - -// Convert *obj* to a sequence if necessary -// Fill *size* to its length. If >= 0 on entry, *size* is an upper size -// bound that may lead to truncation. -Status ConvertToSequenceAndInferSize(PyObject* obj, PyObject** seq, int64_t* size) { - if (PySequence_Check(obj)) { - // obj is already a sequence - int64_t real_size = static_cast(PySequence_Size(obj)); - RETURN_IF_PYERROR(); - if (*size < 0) { - *size = real_size; - } else { - *size = std::min(real_size, *size); - } - Py_INCREF(obj); - *seq = obj; - } else if (*size < 0) { - // unknown size, exhaust iterator - *seq = PySequence_List(obj); - RETURN_IF_PYERROR(); - *size = static_cast(PyList_GET_SIZE(*seq)); - } else { - // size is known but iterator could be infinite - Py_ssize_t i, n = *size; - PyObject* iter = PyObject_GetIter(obj); - RETURN_IF_PYERROR(); - OwnedRef iter_ref(iter); - PyObject* lst = PyList_New(n); - RETURN_IF_PYERROR(); - for (i = 0; i < n; i++) { - PyObject* item = PyIter_Next(iter); - if (!item) { - // either an error occurred or the iterator ended - RETURN_IF_PYERROR(); - break; - } - PyList_SET_ITEM(lst, i, item); - } - // Shrink list if len(iterator) < size - if (i < n && PyList_SetSlice(lst, i, n, NULL)) { - Py_DECREF(lst); - RETURN_IF_PYERROR(); - } - *seq = lst; - *size = std::min(i, *size); - } - return Status::OK(); -} - -} // namespace - -Result> ConvertPySequence(PyObject* obj, PyObject* mask, - PyConversionOptions options, - MemoryPool* pool) { - PyAcquireGIL lock; - - PyObject* seq = nullptr; - OwnedRef tmp_seq_nanny; - - ARROW_ASSIGN_OR_RAISE(auto is_pandas_imported, internal::IsModuleImported("pandas")); - if (is_pandas_imported) { - // If pandas has been already imported initialize the static pandas objects to - // support converting from pd.Timedelta and pd.Timestamp objects - internal::InitPandasStaticData(); - } - - int64_t size = options.size; - RETURN_NOT_OK(ConvertToSequenceAndInferSize(obj, &seq, &size)); - tmp_seq_nanny.reset(seq); - - // In some cases, type inference may be "loose", like strings. If the user - // passed pa.string(), then we will error if we encounter any non-UTF8 - // value. If not, then we will allow the result to be a BinaryArray - if (options.type == nullptr) { - ARROW_ASSIGN_OR_RAISE(options.type, InferArrowType(seq, mask, options.from_pandas)); - options.strict = false; - } else { - options.strict = true; - } - DCHECK_GE(size, 0); - - ARROW_ASSIGN_OR_RAISE(auto converter, (MakeConverter( - options.type, options, pool))); - if (converter->may_overflow()) { - // The converter hierarchy contains binary- or list-like builders which can overflow - // depending on the input values. Wrap the converter with a chunker which detects - // the overflow and automatically creates new chunks. 
- ARROW_ASSIGN_OR_RAISE(auto chunked_converter, MakeChunker(std::move(converter))); - if (mask != nullptr && mask != Py_None) { - RETURN_NOT_OK(chunked_converter->ExtendMasked(seq, mask, size)); - } else { - RETURN_NOT_OK(chunked_converter->Extend(seq, size)); - } - return chunked_converter->ToChunkedArray(); - } else { - // If the converter can't overflow spare the capacity error checking on the hot-path, - // this improves the performance roughly by ~10% for primitive types. - if (mask != nullptr && mask != Py_None) { - RETURN_NOT_OK(converter->ExtendMasked(seq, mask, size)); - } else { - RETURN_NOT_OK(converter->Extend(seq, size)); - } - return converter->ToChunkedArray(); - } -} - -} // namespace py -} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/python_to_arrow.h b/src/vendored/apache-arrow-12.0.1/arrow/python/python_to_arrow.h deleted file mode 100644 index d167996..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/python_to_arrow.h +++ /dev/null @@ -1,80 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Functions for converting between CPython built-in data structures and Arrow -// data structures - -#pragma once - -#include "arrow/python/platform.h" - -#include -#include - -#include "arrow/python/visibility.h" -#include "arrow/type.h" -#include "arrow/util/macros.h" - -#include "arrow/python/common.h" - -namespace arrow { - -class Array; -class Status; - -namespace py { - -struct PyConversionOptions { - PyConversionOptions() = default; - - PyConversionOptions(const std::shared_ptr& type, int64_t size, - MemoryPool* pool, bool from_pandas) - : type(type), size(size), from_pandas(from_pandas) {} - - // Set to null if to be inferred - std::shared_ptr type; - - // Default is -1, which indicates the size should the same as the input sequence - int64_t size = -1; - - bool from_pandas = false; - - /// Used to maintain backwards compatibility for - /// timezone bugs (see ARROW-9528). Should be removed - /// after Arrow 2.0 release. - bool ignore_timezone = false; - - bool strict = false; -}; - -/// \brief Convert sequence (list, generator, NumPy array with dtype object) of -/// Python objects. -/// \param[in] obj the sequence to convert -/// \param[in] mask a NumPy array of true/false values to indicate whether -/// values in the sequence are null (true) or not null (false). 
This parameter -/// may be null -/// \param[in] options various conversion options -/// \param[in] pool MemoryPool to use for allocations -/// \return Result ChunkedArray -ARROW_PYTHON_EXPORT -Result> ConvertPySequence( - PyObject* obj, PyObject* mask, PyConversionOptions options, - MemoryPool* pool = default_memory_pool()); - -} // namespace py - -} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/serialize.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/serialize.cc deleted file mode 100644 index ad079cb..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/serialize.cc +++ /dev/null @@ -1,798 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "arrow/python/serialize.h" -#include "arrow/python/numpy_interop.h" - -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "arrow/array.h" -#include "arrow/array/builder_binary.h" -#include "arrow/array/builder_nested.h" -#include "arrow/array/builder_primitive.h" -#include "arrow/array/builder_union.h" -#include "arrow/io/interfaces.h" -#include "arrow/io/memory.h" -#include "arrow/ipc/util.h" -#include "arrow/ipc/writer.h" -#include "arrow/record_batch.h" -#include "arrow/result.h" -#include "arrow/tensor.h" -#include "arrow/util/logging.h" - -#include "arrow/python/common.h" -#include "arrow/python/datetime.h" -#include "arrow/python/helpers.h" -#include "arrow/python/iterators.h" -#include "arrow/python/numpy_convert.h" -#include "arrow/python/platform.h" -#include "arrow/python/pyarrow.h" - -constexpr int32_t kMaxRecursionDepth = 100; - -namespace arrow { - -using internal::checked_cast; - -namespace py { - -class SequenceBuilder; -class DictBuilder; - -Status Append(PyObject* context, PyObject* elem, SequenceBuilder* builder, - int32_t recursion_depth, SerializedPyObject* blobs_out); - -// A Sequence is a heterogeneous collections of elements. It can contain -// scalar Python types, lists, tuples, dictionaries, tensors and sparse tensors. 
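A minimal sketch of the tagging scheme this class implements, assuming only Arrow's public C++ builder API (the function and tag names are illustrative, not from this patch): each Python type lazily gets its own child builder inside a dense union, addressed by an int8 tag.

    #include <arrow/api.h>
    #include <iostream>

    // Append heterogeneous values by first recording a type tag in the
    // union, then the value itself in the matching child builder.
    arrow::Status BuildTaggedSequence(std::shared_ptr<arrow::Array>* out) {
      arrow::MemoryPool* pool = arrow::default_memory_pool();
      auto ints = std::make_shared<arrow::Int64Builder>(pool);
      auto strs = std::make_shared<arrow::StringBuilder>(pool);
      arrow::DenseUnionBuilder seq(pool);
      const int8_t kInt = seq.AppendChild(ints, "int");
      const int8_t kStr = seq.AppendChild(strs, "str");
      ARROW_RETURN_NOT_OK(seq.Append(kInt));   // tag first...
      ARROW_RETURN_NOT_OK(ints->Append(42));   // ...then the value
      ARROW_RETURN_NOT_OK(seq.Append(kStr));
      ARROW_RETURN_NOT_OK(strs->Append("hello"));
      return seq.Finish(out);
    }

    int main() {
      std::shared_ptr<arrow::Array> out;
      arrow::Status st = BuildTaggedSequence(&out);
      if (!st.ok()) { std::cerr << st.ToString() << std::endl; return 1; }
      std::cout << out->ToString() << std::endl;
      return 0;
    }

The dense-union layout keeps each type's values contiguous while the tag/offset pair preserves element order, which is how heterogeneous Python sequences round-trip without a common supertype.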
-class SequenceBuilder { - public: - explicit SequenceBuilder(MemoryPool* pool = default_memory_pool()) - : pool_(pool), - types_(::arrow::int8(), pool), - offsets_(::arrow::int32(), pool), - type_map_(PythonType::NUM_PYTHON_TYPES, -1) { - auto null_builder = std::make_shared(pool); - auto initial_ty = dense_union({field("0", null())}); - builder_.reset(new DenseUnionBuilder(pool, {null_builder}, initial_ty)); - } - - // Appending a none to the sequence - Status AppendNone() { return builder_->AppendNull(); } - - template - Status CreateAndUpdate(std::shared_ptr* child_builder, int8_t tag, - MakeBuilderFn make_builder) { - if (!*child_builder) { - child_builder->reset(make_builder()); - std::ostringstream convert; - convert.imbue(std::locale::classic()); - convert << static_cast(tag); - type_map_[tag] = builder_->AppendChild(*child_builder, convert.str()); - } - return builder_->Append(type_map_[tag]); - } - - template - Status AppendPrimitive(std::shared_ptr* child_builder, const T val, - int8_t tag) { - RETURN_NOT_OK( - CreateAndUpdate(child_builder, tag, [this]() { return new BuilderType(pool_); })); - return (*child_builder)->Append(val); - } - - // Appending a boolean to the sequence - Status AppendBool(const bool data) { - return AppendPrimitive(&bools_, data, PythonType::BOOL); - } - - // Appending an int64_t to the sequence - Status AppendInt64(const int64_t data) { - return AppendPrimitive(&ints_, data, PythonType::INT); - } - - // Append a list of bytes to the sequence - Status AppendBytes(const uint8_t* data, int32_t length) { - RETURN_NOT_OK(CreateAndUpdate(&bytes_, PythonType::BYTES, - [this]() { return new BinaryBuilder(pool_); })); - return bytes_->Append(data, length); - } - - // Appending a string to the sequence - Status AppendString(const char* data, int32_t length) { - RETURN_NOT_OK(CreateAndUpdate(&strings_, PythonType::STRING, - [this]() { return new StringBuilder(pool_); })); - return strings_->Append(data, length); - } - - // Appending a half_float to the sequence - Status AppendHalfFloat(const npy_half data) { - return AppendPrimitive(&half_floats_, data, PythonType::HALF_FLOAT); - } - - // Appending a float to the sequence - Status AppendFloat(const float data) { - return AppendPrimitive(&floats_, data, PythonType::FLOAT); - } - - // Appending a double to the sequence - Status AppendDouble(const double data) { - return AppendPrimitive(&doubles_, data, PythonType::DOUBLE); - } - - // Appending a Date64 timestamp to the sequence - Status AppendDate64(const int64_t timestamp) { - return AppendPrimitive(&date64s_, timestamp, PythonType::DATE64); - } - - // Appending a tensor to the sequence - // - // \param tensor_index Index of the tensor in the object. - Status AppendTensor(const int32_t tensor_index) { - RETURN_NOT_OK(CreateAndUpdate(&tensor_indices_, PythonType::TENSOR, - [this]() { return new Int32Builder(pool_); })); - return tensor_indices_->Append(tensor_index); - } - - // Appending a sparse coo tensor to the sequence - // - // \param sparse_coo_tensor_index Index of the sparse coo tensor in the object. - Status AppendSparseCOOTensor(const int32_t sparse_coo_tensor_index) { - RETURN_NOT_OK(CreateAndUpdate(&sparse_coo_tensor_indices_, - PythonType::SPARSECOOTENSOR, - [this]() { return new Int32Builder(pool_); })); - return sparse_coo_tensor_indices_->Append(sparse_coo_tensor_index); - } - - // Appending a sparse csr matrix to the sequence - // - // \param sparse_csr_matrix_index Index of the sparse csr matrix in the object. 
- Status AppendSparseCSRMatrix(const int32_t sparse_csr_matrix_index) { - RETURN_NOT_OK(CreateAndUpdate(&sparse_csr_matrix_indices_, - PythonType::SPARSECSRMATRIX, - [this]() { return new Int32Builder(pool_); })); - return sparse_csr_matrix_indices_->Append(sparse_csr_matrix_index); - } - - // Appending a sparse csc matrix to the sequence - // - // \param sparse_csc_matrix_index Index of the sparse csc matrix in the object. - Status AppendSparseCSCMatrix(const int32_t sparse_csc_matrix_index) { - RETURN_NOT_OK(CreateAndUpdate(&sparse_csc_matrix_indices_, - PythonType::SPARSECSCMATRIX, - [this]() { return new Int32Builder(pool_); })); - return sparse_csc_matrix_indices_->Append(sparse_csc_matrix_index); - } - - // Appending a sparse csf tensor to the sequence - // - // \param sparse_csf_tensor_index Index of the sparse csf tensor in the object. - Status AppendSparseCSFTensor(const int32_t sparse_csf_tensor_index) { - RETURN_NOT_OK(CreateAndUpdate(&sparse_csf_tensor_indices_, - PythonType::SPARSECSFTENSOR, - [this]() { return new Int32Builder(pool_); })); - return sparse_csf_tensor_indices_->Append(sparse_csf_tensor_index); - } - - // Appending a numpy ndarray to the sequence - // - // \param tensor_index Index of the tensor in the object. - Status AppendNdarray(const int32_t ndarray_index) { - RETURN_NOT_OK(CreateAndUpdate(&ndarray_indices_, PythonType::NDARRAY, - [this]() { return new Int32Builder(pool_); })); - return ndarray_indices_->Append(ndarray_index); - } - - // Appending a buffer to the sequence - // - // \param buffer_index Index of the buffer in the object. - Status AppendBuffer(const int32_t buffer_index) { - RETURN_NOT_OK(CreateAndUpdate(&buffer_indices_, PythonType::BUFFER, - [this]() { return new Int32Builder(pool_); })); - return buffer_indices_->Append(buffer_index); - } - - Status AppendSequence(PyObject* context, PyObject* sequence, int8_t tag, - std::shared_ptr& target_sequence, - std::unique_ptr& values, int32_t recursion_depth, - SerializedPyObject* blobs_out) { - if (recursion_depth >= kMaxRecursionDepth) { - return Status::NotImplemented( - "This object exceeds the maximum recursion depth. It may contain itself " - "recursively."); - } - RETURN_NOT_OK(CreateAndUpdate(&target_sequence, tag, [this, &values]() { - values.reset(new SequenceBuilder(pool_)); - return new ListBuilder(pool_, values->builder()); - })); - RETURN_NOT_OK(target_sequence->Append()); - return internal::VisitIterable( - sequence, [&](PyObject* obj, bool* keep_going /* unused */) { - return Append(context, obj, values.get(), recursion_depth, blobs_out); - }); - } - - Status AppendList(PyObject* context, PyObject* list, int32_t recursion_depth, - SerializedPyObject* blobs_out) { - return AppendSequence(context, list, PythonType::LIST, lists_, list_values_, - recursion_depth + 1, blobs_out); - } - - Status AppendTuple(PyObject* context, PyObject* tuple, int32_t recursion_depth, - SerializedPyObject* blobs_out) { - return AppendSequence(context, tuple, PythonType::TUPLE, tuples_, tuple_values_, - recursion_depth + 1, blobs_out); - } - - Status AppendSet(PyObject* context, PyObject* set, int32_t recursion_depth, - SerializedPyObject* blobs_out) { - return AppendSequence(context, set, PythonType::SET, sets_, set_values_, - recursion_depth + 1, blobs_out); - } - - Status AppendDict(PyObject* context, PyObject* dict, int32_t recursion_depth, - SerializedPyObject* blobs_out); - - // Finish building the sequence and return the result. 
- // Input arrays may be nullptr - Status Finish(std::shared_ptr* out) { return builder_->Finish(out); } - - std::shared_ptr builder() { return builder_; } - - private: - MemoryPool* pool_; - - Int8Builder types_; - Int32Builder offsets_; - - /// Mapping from PythonType to child index - std::vector type_map_; - - std::shared_ptr bools_; - std::shared_ptr ints_; - std::shared_ptr bytes_; - std::shared_ptr strings_; - std::shared_ptr half_floats_; - std::shared_ptr floats_; - std::shared_ptr doubles_; - std::shared_ptr date64s_; - - std::unique_ptr list_values_; - std::shared_ptr lists_; - std::unique_ptr dict_values_; - std::shared_ptr dicts_; - std::unique_ptr tuple_values_; - std::shared_ptr tuples_; - std::unique_ptr set_values_; - std::shared_ptr sets_; - - std::shared_ptr tensor_indices_; - std::shared_ptr sparse_coo_tensor_indices_; - std::shared_ptr sparse_csr_matrix_indices_; - std::shared_ptr sparse_csc_matrix_indices_; - std::shared_ptr sparse_csf_tensor_indices_; - std::shared_ptr ndarray_indices_; - std::shared_ptr buffer_indices_; - - std::shared_ptr builder_; -}; - -// Constructing dictionaries of key/value pairs. Sequences of -// keys and values are built separately using a pair of -// SequenceBuilders. The resulting Arrow representation -// can be obtained via the Finish method. -class DictBuilder { - public: - explicit DictBuilder(MemoryPool* pool = nullptr) : keys_(pool), vals_(pool) { - builder_.reset(new StructBuilder(struct_({field("keys", dense_union(FieldVector{})), - field("vals", dense_union(FieldVector{}))}), - pool, {keys_.builder(), vals_.builder()})); - } - - // Builder for the keys of the dictionary - SequenceBuilder& keys() { return keys_; } - // Builder for the values of the dictionary - SequenceBuilder& vals() { return vals_; } - - // Construct an Arrow StructArray representing the dictionary. - // Contains a field "keys" for the keys and "vals" for the values. - Status Finish(std::shared_ptr* out) { return builder_->Finish(out); } - - std::shared_ptr builder() { return builder_; } - - private: - SequenceBuilder keys_; - SequenceBuilder vals_; - std::shared_ptr builder_; -}; - -Status SequenceBuilder::AppendDict(PyObject* context, PyObject* dict, - int32_t recursion_depth, - SerializedPyObject* blobs_out) { - if (recursion_depth >= kMaxRecursionDepth) { - return Status::NotImplemented( - "This object exceeds the maximum recursion depth. It may contain itself " - "recursively."); - } - RETURN_NOT_OK(CreateAndUpdate(&dicts_, PythonType::DICT, [this]() { - dict_values_.reset(new DictBuilder(pool_)); - return new ListBuilder(pool_, dict_values_->builder()); - })); - RETURN_NOT_OK(dicts_->Append()); - PyObject* key; - PyObject* value; - Py_ssize_t pos = 0; - while (PyDict_Next(dict, &pos, &key, &value)) { - RETURN_NOT_OK(dict_values_->builder()->Append()); - RETURN_NOT_OK( - Append(context, key, &dict_values_->keys(), recursion_depth + 1, blobs_out)); - RETURN_NOT_OK( - Append(context, value, &dict_values_->vals(), recursion_depth + 1, blobs_out)); - } - - // This block is used to decrement the reference counts of the results - // returned by the serialization callback, which is called in AppendArray, - // in DeserializeDict and in Append - static PyObject* py_type = PyUnicode_FromString("_pytype_"); - if (PyDict_Contains(dict, py_type)) { - // If the dictionary contains the key "_pytype_", then the user has to - // have registered a callback. 
- if (context == Py_None) { - return Status::Invalid("No serialization callback set"); - } - Py_XDECREF(dict); - } - return Status::OK(); -} - -Status CallCustomCallback(PyObject* context, PyObject* method_name, PyObject* elem, - PyObject** result) { - if (context == Py_None) { - *result = NULL; - return Status::SerializationError("error while calling callback on ", - internal::PyObject_StdStringRepr(elem), - ": handler not registered"); - } else { - *result = PyObject_CallMethodObjArgs(context, method_name, elem, NULL); - return CheckPyError(); - } -} - -Status CallSerializeCallback(PyObject* context, PyObject* value, - PyObject** serialized_object) { - OwnedRef method_name(PyUnicode_FromString("_serialize_callback")); - RETURN_NOT_OK(CallCustomCallback(context, method_name.obj(), value, serialized_object)); - if (!PyDict_Check(*serialized_object)) { - return Status::TypeError("serialization callback must return a valid dictionary"); - } - return Status::OK(); -} - -Status CallDeserializeCallback(PyObject* context, PyObject* value, - PyObject** deserialized_object) { - OwnedRef method_name(PyUnicode_FromString("_deserialize_callback")); - return CallCustomCallback(context, method_name.obj(), value, deserialized_object); -} - -Status AppendArray(PyObject* context, PyArrayObject* array, SequenceBuilder* builder, - int32_t recursion_depth, SerializedPyObject* blobs_out); - -template -Status AppendIntegerScalar(PyObject* obj, SequenceBuilder* builder) { - int64_t value = reinterpret_cast(obj)->obval; - return builder->AppendInt64(value); -} - -// Append a potentially 64-bit wide unsigned Numpy scalar. -// Must check for overflow as we reinterpret it as signed int64. -template -Status AppendLargeUnsignedScalar(PyObject* obj, SequenceBuilder* builder) { - constexpr uint64_t max_value = std::numeric_limits::max(); - - uint64_t value = reinterpret_cast(obj)->obval; - if (value > max_value) { - return Status::Invalid("cannot serialize Numpy uint64 scalar >= 2**63"); - } - return builder->AppendInt64(static_cast(value)); -} - -Status AppendScalar(PyObject* obj, SequenceBuilder* builder) { - if (PyArray_IsScalar(obj, Bool)) { - return builder->AppendBool(reinterpret_cast(obj)->obval != 0); - } else if (PyArray_IsScalar(obj, Half)) { - return builder->AppendHalfFloat(reinterpret_cast(obj)->obval); - } else if (PyArray_IsScalar(obj, Float)) { - return builder->AppendFloat(reinterpret_cast(obj)->obval); - } else if (PyArray_IsScalar(obj, Double)) { - return builder->AppendDouble(reinterpret_cast(obj)->obval); - } - if (PyArray_IsScalar(obj, Byte)) { - return AppendIntegerScalar(obj, builder); - } else if (PyArray_IsScalar(obj, Short)) { - return AppendIntegerScalar(obj, builder); - } else if (PyArray_IsScalar(obj, Int)) { - return AppendIntegerScalar(obj, builder); - } else if (PyArray_IsScalar(obj, Long)) { - return AppendIntegerScalar(obj, builder); - } else if (PyArray_IsScalar(obj, LongLong)) { - return AppendIntegerScalar(obj, builder); - } else if (PyArray_IsScalar(obj, Int64)) { - return AppendIntegerScalar(obj, builder); - } else if (PyArray_IsScalar(obj, UByte)) { - return AppendIntegerScalar(obj, builder); - } else if (PyArray_IsScalar(obj, UShort)) { - return AppendIntegerScalar(obj, builder); - } else if (PyArray_IsScalar(obj, UInt)) { - return AppendIntegerScalar(obj, builder); - } else if (PyArray_IsScalar(obj, ULong)) { - return AppendLargeUnsignedScalar(obj, builder); - } else if (PyArray_IsScalar(obj, ULongLong)) { - return AppendLargeUnsignedScalar(obj, builder); - } else if 
(PyArray_IsScalar(obj, UInt64)) { - return AppendLargeUnsignedScalar(obj, builder); - } - return Status::NotImplemented("Numpy scalar type not recognized"); -} - -Status Append(PyObject* context, PyObject* elem, SequenceBuilder* builder, - int32_t recursion_depth, SerializedPyObject* blobs_out) { - // The bool case must precede the int case (PyInt_Check passes for bools) - if (PyBool_Check(elem)) { - RETURN_NOT_OK(builder->AppendBool(elem == Py_True)); - } else if (PyArray_DescrFromScalar(elem)->type_num == NPY_HALF) { - npy_half halffloat = reinterpret_cast(elem)->obval; - RETURN_NOT_OK(builder->AppendHalfFloat(halffloat)); - } else if (PyFloat_Check(elem)) { - RETURN_NOT_OK(builder->AppendDouble(PyFloat_AS_DOUBLE(elem))); - } else if (PyLong_Check(elem)) { - int overflow = 0; - int64_t data = PyLong_AsLongLongAndOverflow(elem, &overflow); - if (!overflow) { - RETURN_NOT_OK(builder->AppendInt64(data)); - } else { - // Attempt to serialize the object using the custom callback. - PyObject* serialized_object; - // The reference count of serialized_object will be decremented in SerializeDict - RETURN_NOT_OK(CallSerializeCallback(context, elem, &serialized_object)); - RETURN_NOT_OK( - builder->AppendDict(context, serialized_object, recursion_depth, blobs_out)); - } - } else if (PyBytes_Check(elem)) { - auto data = reinterpret_cast(PyBytes_AS_STRING(elem)); - int32_t size = -1; - RETURN_NOT_OK(internal::CastSize(PyBytes_GET_SIZE(elem), &size)); - RETURN_NOT_OK(builder->AppendBytes(data, size)); - } else if (PyUnicode_Check(elem)) { - ARROW_ASSIGN_OR_RAISE(auto view, PyBytesView::FromUnicode(elem)); - int32_t size = -1; - RETURN_NOT_OK(internal::CastSize(view.size, &size)); - RETURN_NOT_OK(builder->AppendString(view.bytes, size)); - } else if (PyList_CheckExact(elem)) { - RETURN_NOT_OK(builder->AppendList(context, elem, recursion_depth, blobs_out)); - } else if (PyDict_CheckExact(elem)) { - RETURN_NOT_OK(builder->AppendDict(context, elem, recursion_depth, blobs_out)); - } else if (PyTuple_CheckExact(elem)) { - RETURN_NOT_OK(builder->AppendTuple(context, elem, recursion_depth, blobs_out)); - } else if (PySet_Check(elem)) { - RETURN_NOT_OK(builder->AppendSet(context, elem, recursion_depth, blobs_out)); - } else if (PyArray_IsScalar(elem, Generic)) { - RETURN_NOT_OK(AppendScalar(elem, builder)); - } else if (PyArray_CheckExact(elem)) { - RETURN_NOT_OK(AppendArray(context, reinterpret_cast(elem), builder, - recursion_depth, blobs_out)); - } else if (elem == Py_None) { - RETURN_NOT_OK(builder->AppendNone()); - } else if (PyDateTime_Check(elem)) { - PyDateTime_DateTime* datetime = reinterpret_cast(elem); - RETURN_NOT_OK(builder->AppendDate64(internal::PyDateTime_to_us(datetime))); - } else if (is_buffer(elem)) { - RETURN_NOT_OK(builder->AppendBuffer(static_cast(blobs_out->buffers.size()))); - ARROW_ASSIGN_OR_RAISE(auto buffer, unwrap_buffer(elem)); - blobs_out->buffers.push_back(buffer); - } else if (is_tensor(elem)) { - RETURN_NOT_OK(builder->AppendTensor(static_cast(blobs_out->tensors.size()))); - ARROW_ASSIGN_OR_RAISE(auto tensor, unwrap_tensor(elem)); - blobs_out->tensors.push_back(tensor); - } else if (is_sparse_coo_tensor(elem)) { - RETURN_NOT_OK(builder->AppendSparseCOOTensor( - static_cast(blobs_out->sparse_tensors.size()))); - ARROW_ASSIGN_OR_RAISE(auto tensor, unwrap_sparse_coo_tensor(elem)); - blobs_out->sparse_tensors.push_back(tensor); - } else if (is_sparse_csr_matrix(elem)) { - RETURN_NOT_OK(builder->AppendSparseCSRMatrix( - static_cast(blobs_out->sparse_tensors.size()))); - 
ARROW_ASSIGN_OR_RAISE(auto matrix, unwrap_sparse_csr_matrix(elem)); - blobs_out->sparse_tensors.push_back(matrix); - } else if (is_sparse_csc_matrix(elem)) { - RETURN_NOT_OK(builder->AppendSparseCSCMatrix( - static_cast(blobs_out->sparse_tensors.size()))); - ARROW_ASSIGN_OR_RAISE(auto matrix, unwrap_sparse_csc_matrix(elem)); - blobs_out->sparse_tensors.push_back(matrix); - } else if (is_sparse_csf_tensor(elem)) { - RETURN_NOT_OK(builder->AppendSparseCSFTensor( - static_cast(blobs_out->sparse_tensors.size()))); - ARROW_ASSIGN_OR_RAISE(auto tensor, unwrap_sparse_csf_tensor(elem)); - blobs_out->sparse_tensors.push_back(tensor); - } else { - // Attempt to serialize the object using the custom callback. - PyObject* serialized_object; - // The reference count of serialized_object will be decremented in SerializeDict - RETURN_NOT_OK(CallSerializeCallback(context, elem, &serialized_object)); - RETURN_NOT_OK( - builder->AppendDict(context, serialized_object, recursion_depth, blobs_out)); - } - return Status::OK(); -} - -Status AppendArray(PyObject* context, PyArrayObject* array, SequenceBuilder* builder, - int32_t recursion_depth, SerializedPyObject* blobs_out) { - int dtype = PyArray_TYPE(array); - switch (dtype) { - case NPY_UINT8: - case NPY_INT8: - case NPY_UINT16: - case NPY_INT16: - case NPY_UINT32: - case NPY_INT32: - case NPY_UINT64: - case NPY_INT64: - case NPY_HALF: - case NPY_FLOAT: - case NPY_DOUBLE: { - RETURN_NOT_OK( - builder->AppendNdarray(static_cast(blobs_out->ndarrays.size()))); - std::shared_ptr tensor; - RETURN_NOT_OK(NdarrayToTensor(default_memory_pool(), - reinterpret_cast(array), {}, &tensor)); - blobs_out->ndarrays.push_back(tensor); - } break; - default: { - PyObject* serialized_object; - // The reference count of serialized_object will be decremented in SerializeDict - RETURN_NOT_OK(CallSerializeCallback(context, reinterpret_cast(array), - &serialized_object)); - RETURN_NOT_OK(builder->AppendDict(context, serialized_object, recursion_depth + 1, - blobs_out)); - } - } - return Status::OK(); -} - -std::shared_ptr MakeBatch(std::shared_ptr data) { - auto field = std::make_shared("list", data->type()); - auto schema = ::arrow::schema({field}); - return RecordBatch::Make(schema, data->length(), {data}); -} - -Status SerializeObject(PyObject* context, PyObject* sequence, SerializedPyObject* out) { - PyAcquireGIL lock; - SequenceBuilder builder; - RETURN_NOT_OK(internal::VisitIterable( - sequence, [&](PyObject* obj, bool* keep_going /* unused */) { - return Append(context, obj, &builder, 0, out); - })); - std::shared_ptr array; - RETURN_NOT_OK(builder.Finish(&array)); - out->batch = MakeBatch(array); - return Status::OK(); -} - -Status SerializeNdarray(std::shared_ptr tensor, SerializedPyObject* out) { - std::shared_ptr array; - SequenceBuilder builder; - RETURN_NOT_OK(builder.AppendNdarray(static_cast(out->ndarrays.size()))); - out->ndarrays.push_back(tensor); - RETURN_NOT_OK(builder.Finish(&array)); - out->batch = MakeBatch(array); - return Status::OK(); -} - -Status WriteNdarrayHeader(std::shared_ptr dtype, - const std::vector& shape, int64_t tensor_num_bytes, - io::OutputStream* dst) { - auto empty_tensor = std::make_shared( - dtype, std::make_shared(nullptr, tensor_num_bytes), shape); - SerializedPyObject serialized_tensor; - RETURN_NOT_OK(SerializeNdarray(empty_tensor, &serialized_tensor)); - return serialized_tensor.WriteTo(dst); -} - -SerializedPyObject::SerializedPyObject() - : ipc_options(ipc::IpcWriteOptions::Defaults()) {} - -Status 
SerializedPyObject::WriteTo(io::OutputStream* dst) { - int32_t num_tensors = static_cast(this->tensors.size()); - int32_t num_sparse_tensors = static_cast(this->sparse_tensors.size()); - int32_t num_ndarrays = static_cast(this->ndarrays.size()); - int32_t num_buffers = static_cast(this->buffers.size()); - RETURN_NOT_OK( - dst->Write(reinterpret_cast(&num_tensors), sizeof(int32_t))); - RETURN_NOT_OK( - dst->Write(reinterpret_cast(&num_sparse_tensors), sizeof(int32_t))); - RETURN_NOT_OK( - dst->Write(reinterpret_cast(&num_ndarrays), sizeof(int32_t))); - RETURN_NOT_OK( - dst->Write(reinterpret_cast(&num_buffers), sizeof(int32_t))); - - // Align stream to 8-byte offset - RETURN_NOT_OK(ipc::AlignStream(dst, ipc::kArrowIpcAlignment)); - RETURN_NOT_OK(ipc::WriteRecordBatchStream({this->batch}, this->ipc_options, dst)); - - // Align stream to 64-byte offset so tensor bodies are 64-byte aligned - RETURN_NOT_OK(ipc::AlignStream(dst, ipc::kTensorAlignment)); - - int32_t metadata_length; - int64_t body_length; - for (const auto& tensor : this->tensors) { - RETURN_NOT_OK(ipc::WriteTensor(*tensor, dst, &metadata_length, &body_length)); - RETURN_NOT_OK(ipc::AlignStream(dst, ipc::kTensorAlignment)); - } - - for (const auto& sparse_tensor : this->sparse_tensors) { - RETURN_NOT_OK( - ipc::WriteSparseTensor(*sparse_tensor, dst, &metadata_length, &body_length)); - RETURN_NOT_OK(ipc::AlignStream(dst, ipc::kTensorAlignment)); - } - - for (const auto& tensor : this->ndarrays) { - RETURN_NOT_OK(ipc::WriteTensor(*tensor, dst, &metadata_length, &body_length)); - RETURN_NOT_OK(ipc::AlignStream(dst, ipc::kTensorAlignment)); - } - - for (const auto& buffer : this->buffers) { - int64_t size = buffer->size(); - RETURN_NOT_OK(dst->Write(reinterpret_cast(&size), sizeof(int64_t))); - RETURN_NOT_OK(dst->Write(buffer->data(), size)); - } - - return Status::OK(); -} - -namespace { - -Status CountSparseTensors( - const std::vector>& sparse_tensors, PyObject** out) { - OwnedRef num_sparse_tensors(PyDict_New()); - size_t num_coo = 0; - size_t num_csr = 0; - size_t num_csc = 0; - size_t num_csf = 0; - size_t ndim_csf = 0; - - for (const auto& sparse_tensor : sparse_tensors) { - switch (sparse_tensor->format_id()) { - case SparseTensorFormat::COO: - ++num_coo; - break; - case SparseTensorFormat::CSR: - ++num_csr; - break; - case SparseTensorFormat::CSC: - ++num_csc; - break; - case SparseTensorFormat::CSF: - ++num_csf; - ndim_csf += sparse_tensor->ndim(); - break; - } - } - - PyDict_SetItemString(num_sparse_tensors.obj(), "coo", PyLong_FromSize_t(num_coo)); - PyDict_SetItemString(num_sparse_tensors.obj(), "csr", PyLong_FromSize_t(num_csr)); - PyDict_SetItemString(num_sparse_tensors.obj(), "csc", PyLong_FromSize_t(num_csc)); - PyDict_SetItemString(num_sparse_tensors.obj(), "csf", PyLong_FromSize_t(num_csf)); - PyDict_SetItemString(num_sparse_tensors.obj(), "ndim_csf", PyLong_FromSize_t(ndim_csf)); - RETURN_IF_PYERROR(); - - *out = num_sparse_tensors.detach(); - return Status::OK(); -} - -} // namespace - -Status SerializedPyObject::GetComponents(MemoryPool* memory_pool, PyObject** out) { - PyAcquireGIL py_gil; - - OwnedRef result(PyDict_New()); - PyObject* buffers = PyList_New(0); - PyObject* num_sparse_tensors = nullptr; - - // TODO(wesm): Not sure how pedantic we need to be about checking the return - // values of these functions. 
There are other places where we do not check - // PyDict_SetItem/SetItemString return value, but these failures would be - // quite esoteric - PyDict_SetItemString(result.obj(), "num_tensors", - PyLong_FromSize_t(this->tensors.size())); - RETURN_NOT_OK(CountSparseTensors(this->sparse_tensors, &num_sparse_tensors)); - PyDict_SetItemString(result.obj(), "num_sparse_tensors", num_sparse_tensors); - PyDict_SetItemString(result.obj(), "ndim_csf", num_sparse_tensors); - PyDict_SetItemString(result.obj(), "num_ndarrays", - PyLong_FromSize_t(this->ndarrays.size())); - PyDict_SetItemString(result.obj(), "num_buffers", - PyLong_FromSize_t(this->buffers.size())); - PyDict_SetItemString(result.obj(), "data", buffers); - RETURN_IF_PYERROR(); - - Py_DECREF(buffers); - - auto PushBuffer = [&buffers](const std::shared_ptr& buffer) { - PyObject* wrapped_buffer = wrap_buffer(buffer); - RETURN_IF_PYERROR(); - if (PyList_Append(buffers, wrapped_buffer) < 0) { - Py_DECREF(wrapped_buffer); - RETURN_IF_PYERROR(); - } - Py_DECREF(wrapped_buffer); - return Status::OK(); - }; - - constexpr int64_t kInitialCapacity = 1024; - - // Write the record batch describing the object structure - py_gil.release(); - ARROW_ASSIGN_OR_RAISE(auto stream, - io::BufferOutputStream::Create(kInitialCapacity, memory_pool)); - RETURN_NOT_OK( - ipc::WriteRecordBatchStream({this->batch}, this->ipc_options, stream.get())); - ARROW_ASSIGN_OR_RAISE(auto buffer, stream->Finish()); - py_gil.acquire(); - - RETURN_NOT_OK(PushBuffer(buffer)); - - // For each tensor, get a metadata buffer and a buffer for the body - for (const auto& tensor : this->tensors) { - ARROW_ASSIGN_OR_RAISE(std::unique_ptr message, - ipc::GetTensorMessage(*tensor, memory_pool)); - RETURN_NOT_OK(PushBuffer(message->metadata())); - RETURN_NOT_OK(PushBuffer(message->body())); - } - - // For each sparse tensor, get a metadata buffer and buffers containing index and data - for (const auto& sparse_tensor : this->sparse_tensors) { - ipc::IpcPayload payload; - RETURN_NOT_OK(ipc::GetSparseTensorPayload(*sparse_tensor, memory_pool, &payload)); - RETURN_NOT_OK(PushBuffer(payload.metadata)); - for (const auto& body : payload.body_buffers) { - RETURN_NOT_OK(PushBuffer(body)); - } - } - - // For each ndarray, get a metadata buffer and a buffer for the body - for (const auto& ndarray : this->ndarrays) { - ARROW_ASSIGN_OR_RAISE(std::unique_ptr message, - ipc::GetTensorMessage(*ndarray, memory_pool)); - RETURN_NOT_OK(PushBuffer(message->metadata())); - RETURN_NOT_OK(PushBuffer(message->body())); - } - - for (const auto& buf : this->buffers) { - RETURN_NOT_OK(PushBuffer(buf)); - } - - *out = result.detach(); - return Status::OK(); -} - -} // namespace py -} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/serialize.h b/src/vendored/apache-arrow-12.0.1/arrow/python/serialize.h deleted file mode 100644 index fd207d3..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/serialize.h +++ /dev/null @@ -1,145 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include - -#include "arrow/ipc/options.h" -#include "arrow/python/visibility.h" -#include "arrow/sparse_tensor.h" -#include "arrow/status.h" - -// Forward declaring PyObject, see -// https://mail.python.org/pipermail/python-dev/2003-August/037601.html -#ifndef PyObject_HEAD -struct _object; -typedef _object PyObject; -#endif - -namespace arrow { - -class Buffer; -class DataType; -class MemoryPool; -class RecordBatch; -class Tensor; - -namespace io { - -class OutputStream; - -} // namespace io - -namespace py { - -struct ARROW_PYTHON_EXPORT SerializedPyObject { - std::shared_ptr batch; - std::vector> tensors; - std::vector> sparse_tensors; - std::vector> ndarrays; - std::vector> buffers; - ipc::IpcWriteOptions ipc_options; - - SerializedPyObject(); - - /// \brief Write serialized Python object to OutputStream - /// \param[in,out] dst an OutputStream - /// \return Status - Status WriteTo(io::OutputStream* dst); - - /// \brief Convert SerializedPyObject to a dict containing the message - /// components as Buffer instances with minimal memory allocation - /// - /// { - /// 'num_tensors': M, - /// 'num_sparse_tensors': N, - /// 'num_buffers': K, - /// 'data': [Buffer] - /// } - /// - /// Each tensor is written as two buffers, one for the metadata and one for - /// the body. Therefore, the number of buffers in 'data' is 2 * M + 2 * N + K + 1, - /// with the first buffer containing the serialized record batch containing - /// the UnionArray that describes the whole object - Status GetComponents(MemoryPool* pool, PyObject** out); -}; - -/// \brief Serialize Python sequence as a SerializedPyObject. -/// \param[in] context Serialization context which contains custom serialization -/// and deserialization callbacks. Can be any Python object with a -/// _serialize_callback method for serialization and a _deserialize_callback -/// method for deserialization. If context is None, no custom serialization -/// will be attempted. -/// \param[in] sequence A Python sequence object to serialize to Arrow data -/// structures -/// \param[out] out The serialized representation -/// \return Status -/// -/// Release GIL before calling -ARROW_PYTHON_EXPORT -Status SerializeObject(PyObject* context, PyObject* sequence, SerializedPyObject* out); - -/// \brief Serialize an Arrow Tensor as a SerializedPyObject. -/// \param[in] tensor Tensor to be serialized -/// \param[out] out The serialized representation -/// \return Status -ARROW_PYTHON_EXPORT -Status SerializeTensor(std::shared_ptr tensor, py::SerializedPyObject* out); - -/// \brief Write the Tensor metadata header to an OutputStream. 
-/// \param[in] dtype DataType of the Tensor -/// \param[in] shape The shape of the tensor -/// \param[in] tensor_num_bytes The length of the Tensor data in bytes -/// \param[in] dst The OutputStream to write the Tensor header to -/// \return Status -ARROW_PYTHON_EXPORT -Status WriteNdarrayHeader(std::shared_ptr dtype, - const std::vector& shape, int64_t tensor_num_bytes, - io::OutputStream* dst); - -struct PythonType { - enum type { - NONE, - BOOL, - INT, - PY2INT, // Kept for compatibility - BYTES, - STRING, - HALF_FLOAT, - FLOAT, - DOUBLE, - DATE64, - LIST, - DICT, - TUPLE, - SET, - TENSOR, - NDARRAY, - BUFFER, - SPARSECOOTENSOR, - SPARSECSRMATRIX, - SPARSECSCMATRIX, - SPARSECSFTENSOR, - NUM_PYTHON_TYPES - }; -}; - -} // namespace py - -} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/type_traits.h b/src/vendored/apache-arrow-12.0.1/arrow/python/type_traits.h deleted file mode 100644 index a941577..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/type_traits.h +++ /dev/null @@ -1,350 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
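A minimal sketch of the null-sentinel conventions these traits encode, assuming only the C++ standard library (function names are illustrative): floating-point values use quiet NaN as the null sentinel, while datetime64/timedelta64 use INT64_MIN (NumPy's NaT).

    #include <cassert>
    #include <cstdint>
    #include <limits>

    // Quiet NaN marks a null float/double; NaN != NaN, so self-compare works.
    inline bool float_isnull(double v) { return v != v; }

    // NaT = -2**63 = INT64_MIN marks a null datetime64/timedelta64.
    inline bool datetime_isnull(int64_t v) {
      return v == std::numeric_limits<int64_t>::min();
    }

    int main() {
      assert(float_isnull(std::numeric_limits<double>::quiet_NaN()));
      assert(!float_isnull(0.0));
      assert(datetime_isnull(std::numeric_limits<int64_t>::min()));
      assert(!datetime_isnull(0));
      return 0;
    }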
- -// Internal header - -#pragma once - -#include "arrow/python/platform.h" - -#include -#include - -#include "arrow/python/numpy_interop.h" - -#include - -#include "arrow/type_fwd.h" -#include "arrow/util/logging.h" - -namespace arrow { -namespace py { - -static constexpr int64_t kPandasTimestampNull = std::numeric_limits::min(); -constexpr int64_t kNanosecondsInDay = 86400000000000LL; - -namespace internal { - -// -// Type traits for Numpy -> Arrow equivalence -// -template -struct npy_traits {}; - -template <> -struct npy_traits { - typedef uint8_t value_type; - using TypeClass = BooleanType; - using BuilderClass = BooleanBuilder; - - static constexpr bool supports_nulls = false; - static inline bool isnull(uint8_t v) { return false; } -}; - -#define NPY_INT_DECL(TYPE, CapType, T) \ - template <> \ - struct npy_traits { \ - typedef T value_type; \ - using TypeClass = CapType##Type; \ - using BuilderClass = CapType##Builder; \ - \ - static constexpr bool supports_nulls = false; \ - static inline bool isnull(T v) { return false; } \ - }; - -NPY_INT_DECL(INT8, Int8, int8_t); -NPY_INT_DECL(INT16, Int16, int16_t); -NPY_INT_DECL(INT32, Int32, int32_t); -NPY_INT_DECL(INT64, Int64, int64_t); - -NPY_INT_DECL(UINT8, UInt8, uint8_t); -NPY_INT_DECL(UINT16, UInt16, uint16_t); -NPY_INT_DECL(UINT32, UInt32, uint32_t); -NPY_INT_DECL(UINT64, UInt64, uint64_t); - -#if !NPY_INT32_IS_INT && NPY_BITSOF_INT == 32 -NPY_INT_DECL(INT, Int32, int32_t); -NPY_INT_DECL(UINT, UInt32, uint32_t); -#endif -#if !NPY_INT64_IS_LONG_LONG && NPY_BITSOF_LONGLONG == 64 -NPY_INT_DECL(LONGLONG, Int64, int64_t); -NPY_INT_DECL(ULONGLONG, UInt64, uint64_t); -#endif - -template <> -struct npy_traits { - typedef npy_half value_type; - using TypeClass = HalfFloatType; - using BuilderClass = HalfFloatBuilder; - - static constexpr npy_half na_sentinel = NPY_HALF_NAN; - - static constexpr bool supports_nulls = true; - - static inline bool isnull(npy_half v) { return v == NPY_HALF_NAN; } -}; - -template <> -struct npy_traits { - typedef float value_type; - using TypeClass = FloatType; - using BuilderClass = FloatBuilder; - - // We need to use quiet_NaN here instead of the NAN macro as on Windows - // the NAN macro leads to "division-by-zero" compile-time error with clang. 
- static constexpr float na_sentinel = std::numeric_limits::quiet_NaN(); - - static constexpr bool supports_nulls = true; - - static inline bool isnull(float v) { return v != v; } -}; - -template <> -struct npy_traits { - typedef double value_type; - using TypeClass = DoubleType; - using BuilderClass = DoubleBuilder; - - static constexpr double na_sentinel = std::numeric_limits::quiet_NaN(); - - static constexpr bool supports_nulls = true; - - static inline bool isnull(double v) { return v != v; } -}; - -template <> -struct npy_traits { - typedef int64_t value_type; - using TypeClass = TimestampType; - using BuilderClass = TimestampBuilder; - - static constexpr bool supports_nulls = true; - - static inline bool isnull(int64_t v) { - // NaT = -2**63 - // = -0x8000000000000000 - // = -9223372036854775808; - // = std::numeric_limits::min() - return v == std::numeric_limits::min(); - } -}; - -template <> -struct npy_traits { - typedef int64_t value_type; - using TypeClass = DurationType; - using BuilderClass = DurationBuilder; - - static constexpr bool supports_nulls = true; - - static inline bool isnull(int64_t v) { - // NaT = -2**63 = std::numeric_limits::min() - return v == std::numeric_limits::min(); - } -}; - -template <> -struct npy_traits { - typedef PyObject* value_type; - static constexpr bool supports_nulls = true; - - static inline bool isnull(PyObject* v) { return v == Py_None; } -}; - -// -// Type traits for Arrow -> Numpy equivalence -// Note *supports_nulls* means the equivalent Numpy type support nulls -// -template -struct arrow_traits {}; - -template <> -struct arrow_traits { - static constexpr int npy_type = NPY_BOOL; - static constexpr bool supports_nulls = false; - typedef typename npy_traits::value_type T; -}; - -#define INT_DECL(TYPE) \ - template <> \ - struct arrow_traits { \ - static constexpr int npy_type = NPY_##TYPE; \ - static constexpr bool supports_nulls = false; \ - static constexpr double na_value = std::numeric_limits::quiet_NaN(); \ - typedef typename npy_traits::value_type T; \ - }; - -INT_DECL(INT8); -INT_DECL(INT16); -INT_DECL(INT32); -INT_DECL(INT64); -INT_DECL(UINT8); -INT_DECL(UINT16); -INT_DECL(UINT32); -INT_DECL(UINT64); - -template <> -struct arrow_traits { - static constexpr int npy_type = NPY_FLOAT16; - static constexpr bool supports_nulls = true; - static constexpr uint16_t na_value = NPY_HALF_NAN; - typedef typename npy_traits::value_type T; -}; - -template <> -struct arrow_traits { - static constexpr int npy_type = NPY_FLOAT32; - static constexpr bool supports_nulls = true; - static constexpr float na_value = std::numeric_limits::quiet_NaN(); - typedef typename npy_traits::value_type T; -}; - -template <> -struct arrow_traits { - static constexpr int npy_type = NPY_FLOAT64; - static constexpr bool supports_nulls = true; - static constexpr double na_value = std::numeric_limits::quiet_NaN(); - typedef typename npy_traits::value_type T; -}; - -template <> -struct arrow_traits { - static constexpr int npy_type = NPY_DATETIME; - static constexpr int64_t npy_shift = 1; - - static constexpr bool supports_nulls = true; - static constexpr int64_t na_value = kPandasTimestampNull; - typedef typename npy_traits::value_type T; -}; - -template <> -struct arrow_traits { - static constexpr int npy_type = NPY_TIMEDELTA; - static constexpr int64_t npy_shift = 1; - - static constexpr bool supports_nulls = true; - static constexpr int64_t na_value = kPandasTimestampNull; - typedef typename npy_traits::value_type T; -}; - -template <> -struct arrow_traits { - // 
Data stores as FR_D day unit - static constexpr int npy_type = NPY_DATETIME; - static constexpr int64_t npy_shift = 1; - - static constexpr bool supports_nulls = true; - typedef typename npy_traits::value_type T; - - static constexpr int64_t na_value = kPandasTimestampNull; - static inline bool isnull(int64_t v) { return npy_traits::isnull(v); } -}; - -template <> -struct arrow_traits { - // Data stores as FR_D day unit - static constexpr int npy_type = NPY_DATETIME; - - // There are 1000 * 60 * 60 * 24 = 86400000ms in a day - static constexpr int64_t npy_shift = 86400000; - - static constexpr bool supports_nulls = true; - typedef typename npy_traits::value_type T; - - static constexpr int64_t na_value = kPandasTimestampNull; - static inline bool isnull(int64_t v) { return npy_traits::isnull(v); } -}; - -template <> -struct arrow_traits { - static constexpr int npy_type = NPY_OBJECT; - static constexpr bool supports_nulls = true; - static constexpr int64_t na_value = kPandasTimestampNull; - typedef typename npy_traits::value_type T; -}; - -template <> -struct arrow_traits { - static constexpr int npy_type = NPY_OBJECT; - static constexpr bool supports_nulls = true; - typedef typename npy_traits::value_type T; -}; - -template <> -struct arrow_traits { - static constexpr int npy_type = NPY_OBJECT; - static constexpr bool supports_nulls = true; -}; - -template <> -struct arrow_traits { - static constexpr int npy_type = NPY_OBJECT; - static constexpr bool supports_nulls = true; -}; - -static inline NPY_DATETIMEUNIT NumPyFrequency(TimeUnit::type unit) { - switch (unit) { - case TimestampType::Unit::SECOND: - return NPY_FR_s; - case TimestampType::Unit::MILLI: - return NPY_FR_ms; - break; - case TimestampType::Unit::MICRO: - return NPY_FR_us; - default: - // NANO - return NPY_FR_ns; - } -} - -static inline int NumPyTypeSize(int npy_type) { - npy_type = fix_numpy_type_num(npy_type); - - switch (npy_type) { - case NPY_BOOL: - case NPY_INT8: - case NPY_UINT8: - return 1; - case NPY_INT16: - case NPY_UINT16: - return 2; - case NPY_INT32: - case NPY_UINT32: - return 4; - case NPY_INT64: - case NPY_UINT64: - return 8; - case NPY_FLOAT16: - return 2; - case NPY_FLOAT32: - return 4; - case NPY_FLOAT64: - return 8; - case NPY_DATETIME: - return 8; - case NPY_OBJECT: - return sizeof(void*); - default: - ARROW_CHECK(false) << "unhandled numpy type"; - break; - } - return -1; -} - -} // namespace internal -} // namespace py -} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/udf.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/udf.cc deleted file mode 100644 index 435c89f..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/udf.cc +++ /dev/null @@ -1,736 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
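A minimal sketch of the reference-lifetime rule the UDF kernel states below all follow, assuming CPython's C API (the class name is illustrative; _Py_IsFinalizing is the same private call the original code uses): hold a strong reference to the Python callable for the kernel's lifetime, and deliberately leak it if the interpreter is already finalizing, since a decref into a torn-down runtime can crash at process exit.

    #include <Python.h>

    // Owns a strong reference to a Python callable for the lifetime of a
    // kernel state object; skips the decref during interpreter shutdown,
    // mirroring the detach()-on-finalizing pattern used below.
    class PyFunctionHandle {
     public:
      explicit PyFunctionHandle(PyObject* fn) : fn_(fn) { Py_XINCREF(fn_); }
      ~PyFunctionHandle() {
        if (fn_ == nullptr) return;
        if (_Py_IsFinalizing()) return;  // leak on purpose: runtime is gone
        PyGILState_STATE st = PyGILState_Ensure();
        Py_DECREF(fn_);
        PyGILState_Release(st);
      }
      PyFunctionHandle(const PyFunctionHandle&) = delete;
      PyFunctionHandle& operator=(const PyFunctionHandle&) = delete;
      PyObject* get() const { return fn_; }
     private:
      PyObject* fn_;
    };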
- -#include "arrow/python/udf.h" -#include "arrow/array/builder_base.h" -#include "arrow/buffer_builder.h" -#include "arrow/compute/api_aggregate.h" -#include "arrow/compute/api_vector.h" -#include "arrow/compute/function.h" -#include "arrow/compute/kernel.h" -#include "arrow/compute/row/grouper.h" -#include "arrow/python/common.h" -#include "arrow/table.h" -#include "arrow/util/checked_cast.h" -#include "arrow/util/logging.h" - -namespace arrow { -using compute::ExecSpan; -using compute::Grouper; -using compute::KernelContext; -using compute::KernelState; -using internal::checked_cast; - -namespace py { -namespace { - -struct PythonUdfKernelState : public compute::KernelState { - explicit PythonUdfKernelState(std::shared_ptr function) - : function(function) { - Py_INCREF(function->obj()); - } - - // function needs to be destroyed at process exit - // and Python may no longer be initialized. - ~PythonUdfKernelState() { - if (_Py_IsFinalizing()) { - function->detach(); - } - } - - std::shared_ptr function; -}; - -struct PythonUdfKernelInit { - explicit PythonUdfKernelInit(std::shared_ptr function) - : function(function) { - Py_INCREF(function->obj()); - } - - // function needs to be destroyed at process exit - // and Python may no longer be initialized. - ~PythonUdfKernelInit() { - if (_Py_IsFinalizing()) { - function->detach(); - } - } - - Result> operator()( - compute::KernelContext*, const compute::KernelInitArgs&) { - return std::make_unique(function); - } - - std::shared_ptr function; -}; - -struct ScalarUdfAggregator : public compute::KernelState { - virtual Status Consume(compute::KernelContext* ctx, const compute::ExecSpan& batch) = 0; - virtual Status MergeFrom(compute::KernelContext* ctx, compute::KernelState&& src) = 0; - virtual Status Finalize(compute::KernelContext* ctx, Datum* out) = 0; -}; - -struct HashUdfAggregator : public compute::KernelState { - virtual Status Resize(KernelContext* ctx, int64_t size) = 0; - virtual Status Consume(KernelContext* ctx, const ExecSpan& batch) = 0; - virtual Status Merge(KernelContext* ct, KernelState&& other, const ArrayData&) = 0; - virtual Status Finalize(KernelContext* ctx, Datum* out) = 0; -}; - -arrow::Status AggregateUdfConsume(compute::KernelContext* ctx, - const compute::ExecSpan& batch) { - return checked_cast(ctx->state())->Consume(ctx, batch); -} - -arrow::Status AggregateUdfMerge(compute::KernelContext* ctx, compute::KernelState&& src, - compute::KernelState* dst) { - return checked_cast(dst)->MergeFrom(ctx, std::move(src)); -} - -arrow::Status AggregateUdfFinalize(compute::KernelContext* ctx, arrow::Datum* out) { - return checked_cast(ctx->state())->Finalize(ctx, out); -} - -arrow::Status HashAggregateUdfResize(KernelContext* ctx, int64_t size) { - return checked_cast(ctx->state())->Resize(ctx, size); -} - -arrow::Status HashAggregateUdfConsume(KernelContext* ctx, const ExecSpan& batch) { - return checked_cast(ctx->state())->Consume(ctx, batch); -} - -arrow::Status HashAggregateUdfMerge(KernelContext* ctx, KernelState&& src, - const ArrayData& group_id_mapping) { - return checked_cast(ctx->state()) - ->Merge(ctx, std::move(src), group_id_mapping); -} - -arrow::Status HashAggregateUdfFinalize(KernelContext* ctx, Datum* out) { - return checked_cast(ctx->state())->Finalize(ctx, out); -} - -struct PythonTableUdfKernelInit { - PythonTableUdfKernelInit(std::shared_ptr function_maker, - UdfWrapperCallback cb) - : function_maker(function_maker), cb(cb) { - Py_INCREF(function_maker->obj()); - } - - // function needs to be destroyed at 
process exit - // and Python may no longer be initialized. - ~PythonTableUdfKernelInit() { - if (_Py_IsFinalizing()) { - function_maker->detach(); - } - } - - Result> operator()( - compute::KernelContext* ctx, const compute::KernelInitArgs&) { - UdfContext udf_context{ctx->memory_pool(), /*batch_length=*/0}; - std::unique_ptr function; - RETURN_NOT_OK(SafeCallIntoPython([this, &udf_context, &function] { - OwnedRef empty_tuple(PyTuple_New(0)); - function = std::make_unique( - cb(function_maker->obj(), udf_context, empty_tuple.obj())); - RETURN_NOT_OK(CheckPyError()); - return Status::OK(); - })); - if (!PyCallable_Check(function->obj())) { - return Status::TypeError("Expected a callable Python object."); - } - return std::make_unique(std::move(function)); - } - - std::shared_ptr function_maker; - UdfWrapperCallback cb; -}; - -struct PythonUdfScalarAggregatorImpl : public ScalarUdfAggregator { - PythonUdfScalarAggregatorImpl(std::shared_ptr function, - UdfWrapperCallback cb, - std::vector> input_types, - std::shared_ptr output_type) - : function(function), cb(std::move(cb)), output_type(std::move(output_type)) { - Py_INCREF(function->obj()); - std::vector> fields; - for (size_t i = 0; i < input_types.size(); i++) { - fields.push_back(field("", input_types[i])); - } - input_schema = schema(std::move(fields)); - }; - - ~PythonUdfScalarAggregatorImpl() override { - if (_Py_IsFinalizing()) { - function->detach(); - } - } - - Status Consume(compute::KernelContext* ctx, const compute::ExecSpan& batch) override { - ARROW_ASSIGN_OR_RAISE( - auto rb, batch.ToExecBatch().ToRecordBatch(input_schema, ctx->memory_pool())); - values.push_back(std::move(rb)); - return Status::OK(); - } - - Status MergeFrom(compute::KernelContext* ctx, compute::KernelState&& src) override { - auto& other_values = checked_cast(src).values; - values.insert(values.end(), std::make_move_iterator(other_values.begin()), - std::make_move_iterator(other_values.end())); - - other_values.erase(other_values.begin(), other_values.end()); - return Status::OK(); - } - - Status Finalize(compute::KernelContext* ctx, Datum* out) override { - auto state = - arrow::internal::checked_cast(ctx->state()); - const int num_args = input_schema->num_fields(); - - // Note: The way that batches are concatenated together - // would result in using double amount of the memory. - // This is OK for now because non decomposable aggregate - // UDF is supposed to be used with segmented aggregation - // where the size of the segment is more or less constant - // so doubling that is not a big deal. This can be also - // improved in the future to use more efficient way to - // concatenate. 
- ARROW_ASSIGN_OR_RAISE(auto table, - arrow::Table::FromRecordBatches(input_schema, values)); - ARROW_ASSIGN_OR_RAISE(table, table->CombineChunks(ctx->memory_pool())); - UdfContext udf_context{ctx->memory_pool(), table->num_rows()}; - - if (table->num_rows() == 0) { - return Status::Invalid("Finalized is called with empty inputs"); - } - - RETURN_NOT_OK(SafeCallIntoPython([&] { - std::unique_ptr result; - OwnedRef arg_tuple(PyTuple_New(num_args)); - RETURN_NOT_OK(CheckPyError()); - - for (int arg_id = 0; arg_id < num_args; arg_id++) { - // Since we combined chunks there is only one chunk - std::shared_ptr c_data = table->column(arg_id)->chunk(0); - PyObject* data = wrap_array(c_data); - PyTuple_SetItem(arg_tuple.obj(), arg_id, data); - } - result = - std::make_unique(cb(function->obj(), udf_context, arg_tuple.obj())); - RETURN_NOT_OK(CheckPyError()); - // unwrapping the output for expected output type - if (is_scalar(result->obj())) { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr val, unwrap_scalar(result->obj())); - if (*output_type != *val->type) { - return Status::TypeError("Expected output datatype ", output_type->ToString(), - ", but function returned datatype ", - val->type->ToString()); - } - out->value = std::move(val); - return Status::OK(); - } - return Status::TypeError("Unexpected output type: ", - Py_TYPE(result->obj())->tp_name, " (expected Scalar)"); - })); - return Status::OK(); - } - - std::shared_ptr function; - UdfWrapperCallback cb; - std::vector> values; - std::shared_ptr input_schema; - std::shared_ptr output_type; -}; - -struct PythonUdfHashAggregatorImpl : public HashUdfAggregator { - PythonUdfHashAggregatorImpl(std::shared_ptr function, - UdfWrapperCallback cb, - std::vector> input_types, - std::shared_ptr output_type) - : function(function), cb(std::move(cb)), output_type(std::move(output_type)) { - Py_INCREF(function->obj()); - std::vector> fields; - fields.reserve(input_types.size()); - for (size_t i = 0; i < input_types.size(); i++) { - fields.push_back(field("", input_types[i])); - } - input_schema = schema(std::move(fields)); - }; - - ~PythonUdfHashAggregatorImpl() override { - if (_Py_IsFinalizing()) { - function->detach(); - } - } - - // same as ApplyGrouping in parition.cc - // replicated the code here to avoid complicating the dependencies - static Result ApplyGroupings( - const ListArray& groupings, const std::shared_ptr& batch) { - ARROW_ASSIGN_OR_RAISE(Datum sorted, - compute::Take(batch, groupings.data()->child_data[0])); - - const auto& sorted_batch = *sorted.record_batch(); - - RecordBatchVector out(static_cast(groupings.length())); - for (size_t i = 0; i < out.size(); ++i) { - out[i] = sorted_batch.Slice(groupings.value_offset(i), groupings.value_length(i)); - } - - return out; - } - - Status Resize(KernelContext* ctx, int64_t new_num_groups) { - // We only need to change num_groups in resize - // similar to other hash aggregate kernels - num_groups = new_num_groups; - return Status::OK(); - } - - Status Consume(KernelContext* ctx, const ExecSpan& batch) { - ARROW_ASSIGN_OR_RAISE( - std::shared_ptr rb, - batch.ToExecBatch().ToRecordBatch(input_schema, ctx->memory_pool())); - - // This is similar to GroupedListImpl - // last array is the group id - const ArraySpan& groups_array_data = batch[batch.num_values() - 1].array; - DCHECK_EQ(groups_array_data.offset, 0); - int64_t batch_num_values = groups_array_data.length; - const auto* batch_groups = groups_array_data.GetValues(1); - RETURN_NOT_OK(groups.Append(batch_groups, batch_num_values)); - 
values.push_back(std::move(rb)); - num_values += batch_num_values; - return Status::OK(); - } - Status Merge(KernelContext* ctx, KernelState&& other_state, - const ArrayData& group_id_mapping) { - // This is similar to GroupedListImpl - auto& other = checked_cast(other_state); - auto& other_values = other.values; - const uint32_t* other_raw_groups = other.groups.data(); - values.insert(values.end(), std::make_move_iterator(other_values.begin()), - std::make_move_iterator(other_values.end())); - - auto g = group_id_mapping.GetValues(1); - for (uint32_t other_g = 0; static_cast(other_g) < other.num_values; - ++other_g) { - // Different state can have different group_id mappings, so we - // need to translate the ids - RETURN_NOT_OK(groups.Append(g[other_raw_groups[other_g]])); - } - - num_values += other.num_values; - return Status::OK(); - } - - Status Finalize(KernelContext* ctx, Datum* out) { - // Exclude the last column which is the group id - const int num_args = input_schema->num_fields() - 1; - - ARROW_ASSIGN_OR_RAISE(auto groups_buffer, groups.Finish()); - ARROW_ASSIGN_OR_RAISE(auto groupings, - Grouper::MakeGroupings(UInt32Array(num_values, groups_buffer), - static_cast(num_groups))); - - ARROW_ASSIGN_OR_RAISE(auto table, - arrow::Table::FromRecordBatches(input_schema, values)); - ARROW_ASSIGN_OR_RAISE(auto rb, table->CombineChunksToBatch(ctx->memory_pool())); - UdfContext udf_context{ctx->memory_pool(), table->num_rows()}; - - if (rb->num_rows() == 0) { - *out = Datum(); - return Status::OK(); - } - - ARROW_ASSIGN_OR_RAISE(RecordBatchVector rbs, ApplyGroupings(*groupings, rb)); - - return SafeCallIntoPython([&] { - ARROW_ASSIGN_OR_RAISE(std::unique_ptr builder, - MakeBuilder(output_type, ctx->memory_pool())); - for (auto& group_rb : rbs) { - std::unique_ptr result; - OwnedRef arg_tuple(PyTuple_New(num_args)); - RETURN_NOT_OK(CheckPyError()); - - for (int arg_id = 0; arg_id < num_args; arg_id++) { - // Since we combined chunks there is only one chunk - std::shared_ptr c_data = group_rb->column(arg_id); - PyObject* data = wrap_array(c_data); - PyTuple_SetItem(arg_tuple.obj(), arg_id, data); - } - - result = - std::make_unique(cb(function->obj(), udf_context, arg_tuple.obj())); - RETURN_NOT_OK(CheckPyError()); - - // unwrapping the output for expected output type - if (is_scalar(result->obj())) { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr val, - unwrap_scalar(result->obj())); - if (*output_type != *val->type) { - return Status::TypeError("Expected output datatype ", output_type->ToString(), - ", but function returned datatype ", - val->type->ToString()); - } - ARROW_RETURN_NOT_OK(builder->AppendScalar(std::move(*val))); - } else { - return Status::TypeError("Unexpected output type: ", - Py_TYPE(result->obj())->tp_name, " (expected Scalar)"); - } - } - ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish()); - out->value = std::move(result->data()); - return Status::OK(); - }); - } - - std::shared_ptr function; - UdfWrapperCallback cb; - // Accumulated input batches - std::vector> values; - // Group ids - extracted from the last column from the batch - TypedBufferBuilder groups; - int64_t num_groups = 0; - int64_t num_values = 0; - std::shared_ptr input_schema; - std::shared_ptr output_type; -}; - -struct PythonUdf : public PythonUdfKernelState { - PythonUdf(std::shared_ptr function, UdfWrapperCallback cb, - std::vector input_types, compute::OutputType output_type) - : PythonUdfKernelState(function), - cb(cb), - input_types(input_types), - output_type(output_type) {} - - UdfWrapperCallback 
cb; - std::vector input_types; - compute::OutputType output_type; - TypeHolder resolved_type; - - Result ResolveType(compute::KernelContext* ctx, - const std::vector& types) { - if (input_types == types) { - if (!resolved_type) { - ARROW_ASSIGN_OR_RAISE(resolved_type, output_type.Resolve(ctx, input_types)); - } - return resolved_type; - } - return output_type.Resolve(ctx, types); - } - - Status Exec(compute::KernelContext* ctx, const compute::ExecSpan& batch, - compute::ExecResult* out) { - auto state = arrow::internal::checked_cast(ctx->state()); - std::shared_ptr& function = state->function; - const int num_args = batch.num_values(); - UdfContext udf_context{ctx->memory_pool(), batch.length}; - - OwnedRef arg_tuple(PyTuple_New(num_args)); - RETURN_NOT_OK(CheckPyError()); - for (int arg_id = 0; arg_id < num_args; arg_id++) { - if (batch[arg_id].is_scalar()) { - std::shared_ptr c_data = batch[arg_id].scalar->GetSharedPtr(); - PyObject* data = wrap_scalar(c_data); - PyTuple_SetItem(arg_tuple.obj(), arg_id, data); - } else { - std::shared_ptr c_data = batch[arg_id].array.ToArray(); - PyObject* data = wrap_array(c_data); - PyTuple_SetItem(arg_tuple.obj(), arg_id, data); - } - } - - OwnedRef result(cb(function->obj(), udf_context, arg_tuple.obj())); - RETURN_NOT_OK(CheckPyError()); - // unwrapping the output for expected output type - if (is_array(result.obj())) { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr val, unwrap_array(result.obj())); - ARROW_ASSIGN_OR_RAISE(TypeHolder type, ResolveType(ctx, batch.GetTypes())); - if (type.type == NULLPTR) { - return Status::TypeError("expected output datatype is null"); - } - if (*type.type != *val->type()) { - return Status::TypeError("Expected output datatype ", type.type->ToString(), - ", but function returned datatype ", - val->type()->ToString()); - } - out->value = std::move(val->data()); - return Status::OK(); - } else { - return Status::TypeError("Unexpected output type: ", Py_TYPE(result.obj())->tp_name, - " (expected Array)"); - } - return Status::OK(); - } -}; - -Status PythonUdfExec(compute::KernelContext* ctx, const compute::ExecSpan& batch, - compute::ExecResult* out) { - auto udf = static_cast(ctx->kernel()->data.get()); - return SafeCallIntoPython([&]() -> Status { return udf->Exec(ctx, batch, out); }); -} - -Status RegisterUdf(PyObject* user_function, compute::KernelInit kernel_init, - UdfWrapperCallback wrapper, const UdfOptions& options, - compute::FunctionRegistry* registry) { - if (!PyCallable_Check(user_function)) { - return Status::TypeError("Expected a callable Python object."); - } - auto scalar_func = std::make_shared( - options.func_name, options.arity, options.func_doc); - Py_INCREF(user_function); - std::vector input_types; - for (const auto& in_dtype : options.input_types) { - input_types.emplace_back(in_dtype); - } - compute::OutputType output_type(options.output_type); - auto udf_data = std::make_shared( - std::make_shared(user_function), wrapper, - TypeHolder::FromTypes(options.input_types), options.output_type); - compute::ScalarKernel kernel( - compute::KernelSignature::Make(std::move(input_types), std::move(output_type), - options.arity.is_varargs), - PythonUdfExec, kernel_init); - kernel.data = std::move(udf_data); - - kernel.mem_allocation = compute::MemAllocation::NO_PREALLOCATE; - kernel.null_handling = compute::NullHandling::COMPUTED_NO_PREALLOCATE; - RETURN_NOT_OK(scalar_func->AddKernel(std::move(kernel))); - if (registry == NULLPTR) { - registry = compute::GetFunctionRegistry(); - } - 
RETURN_NOT_OK(registry->AddFunction(std::move(scalar_func))); - return Status::OK(); -} - -} // namespace - -Status RegisterScalarFunction(PyObject* function, UdfWrapperCallback cb, - const UdfOptions& options, - compute::FunctionRegistry* registry) { - return RegisterUdf(function, - PythonUdfKernelInit{std::make_shared(function)}, cb, - options, registry); -} - -Status RegisterTabularFunction(PyObject* function, UdfWrapperCallback cb, - const UdfOptions& options, - compute::FunctionRegistry* registry) { - if (options.arity.num_args != 0 || options.arity.is_varargs) { - return Status::NotImplemented("tabular function of non-null arity"); - } - if (options.output_type->id() != Type::type::STRUCT) { - return Status::Invalid("tabular function with non-struct output"); - } - return RegisterUdf( - function, PythonTableUdfKernelInit{std::make_shared(function), cb}, - cb, options, registry); -} - -Status RegisterScalarAggregateFunction(PyObject* function, UdfWrapperCallback cb, - const UdfOptions& options, - compute::FunctionRegistry* registry) { - if (!PyCallable_Check(function)) { - return Status::TypeError("Expected a callable Python object."); - } - - if (registry == NULLPTR) { - registry = compute::GetFunctionRegistry(); - } - - // Py_INCREF here so that once a function is registered - // its refcount gets increased by 1 and doesn't get gced - // if all existing refs are gone - Py_INCREF(function); - - static auto default_scalar_aggregate_options = - compute::ScalarAggregateOptions::Defaults(); - auto aggregate_func = std::make_shared( - options.func_name, options.arity, options.func_doc, - &default_scalar_aggregate_options); - - std::vector input_types; - for (const auto& in_dtype : options.input_types) { - input_types.emplace_back(in_dtype); - } - compute::OutputType output_type(options.output_type); - - compute::KernelInit init = [cb, function, options](compute::KernelContext* ctx, - const compute::KernelInitArgs& args) - -> Result> { - return std::make_unique( - std::make_shared(function), cb, options.input_types, - options.output_type); - }; - - auto sig = compute::KernelSignature::Make( - std::move(input_types), std::move(output_type), options.arity.is_varargs); - compute::ScalarAggregateKernel kernel(std::move(sig), std::move(init), - AggregateUdfConsume, AggregateUdfMerge, - AggregateUdfFinalize, /*ordered=*/false); - RETURN_NOT_OK(aggregate_func->AddKernel(std::move(kernel))); - RETURN_NOT_OK(registry->AddFunction(std::move(aggregate_func))); - return Status::OK(); -} - -/// \brief Create a new UdfOptions with adjustment for hash kernel -/// \param options User provided udf options -UdfOptions AdjustForHashAggregate(const UdfOptions& options) { - UdfOptions hash_options; - // Append hash_ before the function name to seperate from the scalar - // version - hash_options.func_name = "hash_" + options.func_name; - // Extend input types with group id. Group id is appended by the group - // aggregation node. Here we change both arity and input types - if (options.arity.is_varargs) { - hash_options.arity = options.arity; - } else { - hash_options.arity = compute::Arity(options.arity.num_args + 1, false); - } - // Changing the function doc shouldn't be necessarily because group id - // is not user visible, however, this is currently needed to pass the - // function validation. 
The name group_id_array is consistent with - // hash kernels in hash_aggregate.cc - hash_options.func_doc = options.func_doc; - hash_options.func_doc.arg_names.emplace_back("group_id_array"); - std::vector> input_dtypes = options.input_types; - input_dtypes.emplace_back(uint32()); - hash_options.input_types = std::move(input_dtypes); - hash_options.output_type = options.output_type; - return hash_options; -} - -Status RegisterHashAggregateFunction(PyObject* function, UdfWrapperCallback cb, - const UdfOptions& options, - compute::FunctionRegistry* registry) { - if (!PyCallable_Check(function)) { - return Status::TypeError("Expected a callable Python object."); - } - - if (registry == NULLPTR) { - registry = compute::GetFunctionRegistry(); - } - - // Py_INCREF here so that once a function is registered - // its refcount gets increased by 1 and doesn't get gced - // if all existing refs are gone - Py_INCREF(function); - UdfOptions hash_options = AdjustForHashAggregate(options); - - std::vector input_types; - for (const auto& in_dtype : hash_options.input_types) { - input_types.emplace_back(in_dtype); - } - compute::OutputType output_type(hash_options.output_type); - - static auto default_hash_aggregate_options = - compute::ScalarAggregateOptions::Defaults(); - auto hash_aggregate_func = std::make_shared( - hash_options.func_name, hash_options.arity, hash_options.func_doc, - &default_hash_aggregate_options); - - compute::KernelInit init = [function, cb, hash_options]( - compute::KernelContext* ctx, - const compute::KernelInitArgs& args) - -> Result> { - return std::make_unique( - std::make_shared(function), cb, hash_options.input_types, - hash_options.output_type); - }; - - auto sig = compute::KernelSignature::Make( - std::move(input_types), std::move(output_type), hash_options.arity.is_varargs); - - compute::HashAggregateKernel kernel( - std::move(sig), std::move(init), HashAggregateUdfResize, HashAggregateUdfConsume, - HashAggregateUdfMerge, HashAggregateUdfFinalize, /*ordered=*/false); - RETURN_NOT_OK(hash_aggregate_func->AddKernel(std::move(kernel))); - RETURN_NOT_OK(registry->AddFunction(std::move(hash_aggregate_func))); - return Status::OK(); -} - -Status RegisterAggregateFunction(PyObject* function, UdfWrapperCallback cb, - const UdfOptions& options, - compute::FunctionRegistry* registry) { - RETURN_NOT_OK(RegisterScalarAggregateFunction(function, cb, options, registry)); - RETURN_NOT_OK(RegisterHashAggregateFunction(function, cb, options, registry)); - - return Status::OK(); -} - -Result> CallTabularFunction( - const std::string& func_name, const std::vector& args, - compute::FunctionRegistry* registry) { - if (args.size() != 0) { - return Status::NotImplemented("non-empty arguments to tabular function"); - } - if (registry == NULLPTR) { - registry = compute::GetFunctionRegistry(); - } - ARROW_ASSIGN_OR_RAISE(auto func, registry->GetFunction(func_name)); - if (func->kind() != compute::Function::SCALAR) { - return Status::Invalid("tabular function of non-scalar kind"); - } - auto arity = func->arity(); - if (arity.num_args != 0 || arity.is_varargs) { - return Status::NotImplemented("tabular function of non-null arity"); - } - auto kernels = - arrow::internal::checked_pointer_cast(func)->kernels(); - if (kernels.size() != 1) { - return Status::NotImplemented("tabular function with non-single kernel"); - } - const compute::ScalarKernel* kernel = kernels[0]; - auto out_type = kernel->signature->out_type(); - if (out_type.kind() != compute::OutputType::FIXED) { - return 
Status::Invalid("tabular kernel of non-fixed kind"); - } - auto datatype = out_type.type(); - if (datatype->id() != Type::type::STRUCT) { - return Status::Invalid("tabular kernel with non-struct output"); - } - auto struct_type = arrow::internal::checked_cast(datatype.get()); - auto schema = ::arrow::schema(struct_type->fields()); - std::vector in_types; - ARROW_ASSIGN_OR_RAISE(auto func_exec, - GetFunctionExecutor(func_name, in_types, NULLPTR, registry)); - auto next_func = [schema, func_exec = std::move( - func_exec)]() -> Result> { - std::vector args; - // passed_length of -1 or 0 with args.size() of 0 leads to an empty ExecSpanIterator - // in exec.cc and to never invoking the source function, so 1 is passed instead - // TODO: GH-33612: Support batch size in user-defined tabular functions - ARROW_ASSIGN_OR_RAISE(auto datum, func_exec->Execute(args, /*passed_length=*/1)); - if (!datum.is_array()) { - return Status::Invalid("UDF result of non-array kind"); - } - std::shared_ptr array = datum.make_array(); - if (array->length() == 0) { - return IterationTraits>::End(); - } - ARROW_ASSIGN_OR_RAISE(auto batch, RecordBatch::FromStructArray(std::move(array))); - if (!schema->Equals(batch->schema())) { - return Status::Invalid("UDF result with shape not conforming to schema"); - } - return std::move(batch); - }; - return RecordBatchReader::MakeFromIterator(MakeFunctionIterator(std::move(next_func)), - schema); -} - -} // namespace py -} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/udf.h b/src/vendored/apache-arrow-12.0.1/arrow/python/udf.h deleted file mode 100644 index 682cbb2..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/udf.h +++ /dev/null @@ -1,76 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include "arrow/compute/exec.h" -#include "arrow/compute/function.h" -#include "arrow/compute/registry.h" -#include "arrow/python/platform.h" -#include "arrow/record_batch.h" -#include "arrow/util/iterator.h" - -#include "arrow/python/common.h" -#include "arrow/python/pyarrow.h" -#include "arrow/python/visibility.h" - -namespace arrow { - -namespace py { - -// TODO: TODO(ARROW-16041): UDF Options are not exposed to the Python -// users. This feature will be included when extending to provide advanced -// options for the users. -struct ARROW_PYTHON_EXPORT UdfOptions { - std::string func_name; - compute::Arity arity; - compute::FunctionDoc func_doc; - std::vector> input_types; - std::shared_ptr output_type; -}; - -/// \brief A context passed as the first argument of UDF functions. 
-struct ARROW_PYTHON_EXPORT UdfContext { - MemoryPool* pool; - int64_t batch_length; -}; - -using UdfWrapperCallback = std::function; - -/// \brief register a Scalar user-defined-function from Python -Status ARROW_PYTHON_EXPORT RegisterScalarFunction( - PyObject* user_function, UdfWrapperCallback wrapper, const UdfOptions& options, - compute::FunctionRegistry* registry = NULLPTR); - -/// \brief register a Table user-defined-function from Python -Status ARROW_PYTHON_EXPORT RegisterTabularFunction( - PyObject* user_function, UdfWrapperCallback wrapper, const UdfOptions& options, - compute::FunctionRegistry* registry = NULLPTR); - -/// \brief register a Aggregate user-defined-function from Python -Status ARROW_PYTHON_EXPORT RegisterAggregateFunction( - PyObject* user_function, UdfWrapperCallback wrapper, const UdfOptions& options, - compute::FunctionRegistry* registry = NULLPTR); - -Result> ARROW_PYTHON_EXPORT -CallTabularFunction(const std::string& func_name, const std::vector& args, - compute::FunctionRegistry* registry = NULLPTR); - -} // namespace py - -} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/visibility.h b/src/vendored/apache-arrow-12.0.1/arrow/python/visibility.h deleted file mode 100644 index dd43b32..0000000 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/visibility.h +++ /dev/null @@ -1,39 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#pragma once - -#if defined(_WIN32) || defined(__CYGWIN__) // Windows -#if defined(_MSC_VER) -#pragma warning(disable : 4251) -#else -#pragma GCC diagnostic ignored "-Wattributes" -#endif - -#ifdef ARROW_PYTHON_STATIC -#define ARROW_PYTHON_EXPORT -#elif defined(ARROW_PYTHON_EXPORTING) -#define ARROW_PYTHON_EXPORT __declspec(dllexport) -#else -#define ARROW_PYTHON_EXPORT __declspec(dllimport) -#endif - -#else // Not Windows -#ifndef ARROW_PYTHON_EXPORT -#define ARROW_PYTHON_EXPORT __attribute__((visibility("default"))) -#endif -#endif // Non-Windows diff --git a/vcpkg.json b/vcpkg.json index eec37b5..7bfd7c4 100644 --- a/vcpkg.json +++ b/vcpkg.json @@ -2,6 +2,7 @@ "name": "main", "version-string": "latest", "dependencies": [ - "arrow" + "arrow", + "pybind11" ] } From fa23cfaeba819203f5892109a4c9f4e483b17dd3 Mon Sep 17 00:00:00 2001 From: Tim Paine <3105306+timkpaine@users.noreply.github.com> Date: Fri, 23 Feb 2024 17:02:08 -0500 Subject: [PATCH 2/4] finish pybind + typecasters --- CMakeLists.txt | 15 ++- Makefile | 6 +- arrow_python_nocopy/__init__.py | 16 ++-- arrow_python_nocopy/tests/test_all.py | 12 +-- src/apn-python/caster.h | 74 +++++++++++++++ src/apn-python/caster.hpp | 72 --------------- src/apn-python/common.cpp | 75 +++++++++++++++ src/apn-python/common.h | 16 ++++ src/apn-python/cpython.cpp | 108 +++++----------------- src/apn-python/{cpython.hpp => cpython.h} | 0 src/apn-python/pybind11.cpp | 31 ++++++- src/apn-python/pybind11.h | 25 +++++ src/apn-python/pybind11.hpp | 17 ---- src/apn/apn.cpp | 2 +- src/apn/{apn.hpp => apn.h} | 0 src/apn/bridge.cpp | 2 +- 16 files changed, 274 insertions(+), 197 deletions(-) create mode 100644 src/apn-python/caster.h delete mode 100644 src/apn-python/caster.hpp create mode 100644 src/apn-python/common.cpp create mode 100644 src/apn-python/common.h rename src/apn-python/{cpython.hpp => cpython.h} (100%) create mode 100644 src/apn-python/pybind11.h delete mode 100644 src/apn-python/pybind11.hpp rename src/apn/{apn.hpp => apn.h} (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 23ef99b..4916f39 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -224,7 +224,7 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) ######################### set( PROJECT_HDRS - "${PROJECT_SOURCE_DIR}/src/apn/apn.hpp" + "${PROJECT_SOURCE_DIR}/src/apn/apn.h" "${PROJECT_SOURCE_DIR}/src/apn/bridge.h" "${PROJECT_SOURCE_DIR}/src/apn-common/exports.h" ) @@ -263,22 +263,27 @@ if(BUILD_PYTHON) include_directories(${pybind11_INCLUDE_DIR}) # common functionality + add_library(common SHARED "${PROJECT_SOURCE_DIR}/src/apn-python/common.cpp" ${VENDORED_PYARROW_SRCS}) + # pybind11 extension pybind11_add_module(pybind11extension MODULE "${PROJECT_SOURCE_DIR}/src/apn-python/pybind11.cpp") - set_target_properties(pybind11extension PROPERTIES PUBLIC_HEADER "${PROJECT_SOURCE_DIR}/src/apn-python/pybind11.hpp") + set_target_properties(pybind11extension PROPERTIES PUBLIC_HEADER "${PROJECT_SOURCE_DIR}/src/apn-python/pybind11.h") # cpython extension add_library(cpythonextension SHARED "${PROJECT_SOURCE_DIR}/src/apn-python/cpython.cpp") - set_target_properties(cpythonextension PROPERTIES PUBLIC_HEADER "${PROJECT_SOURCE_DIR}/src/apn-python/cpython.hpp") + set_target_properties(cpythonextension PROPERTIES PUBLIC_HEADER "${PROJECT_SOURCE_DIR}/src/apn-python/cpython.h") # Link to standalone/common library - target_link_libraries(pybind11extension PRIVATE arrow-python-nocopy) - target_link_libraries(cpythonextension PRIVATE arrow-python-nocopy) + target_link_libraries(common 
PRIVATE arrow-python-nocopy)
+  target_link_libraries(pybind11extension PRIVATE common)
+  target_link_libraries(cpythonextension PRIVATE common)
+  set_property(TARGET common PROPERTY INSTALL_RPATH "${module_origin_path}")
   set_property(TARGET pybind11extension PROPERTY INSTALL_RPATH "${module_origin_path}")
   set_property(TARGET cpythonextension PROPERTY INSTALL_RPATH "${module_origin_path}")
   set_property(TARGET cpythonextension PROPERTY PREFIX "")
 
   # install in python module
+  install(TARGETS common EXPORT ArrowPythonNocopy LIBRARY DESTINATION lib PUBLIC_HEADER DESTINATION include/python)
   install(TARGETS pybind11extension EXPORT ArrowPythonNocopy LIBRARY DESTINATION lib PUBLIC_HEADER DESTINATION include/python)
   install(TARGETS cpythonextension EXPORT ArrowPythonNocopy LIBRARY DESTINATION lib PUBLIC_HEADER DESTINATION include/python)
 endif()
diff --git a/Makefile b/Makefile
index 8c31873..5ee3dfb 100644
--- a/Makefile
+++ b/Makefile
@@ -52,7 +52,7 @@ lint-py:
 	python -m ruff arrow_python_nocopy
 
 lint-cpp:
-	clang-format --dry-run -Werror -i -style=file `find ./src -name "*.*pp"` || echo "sometimes flaky"
+	clang-format --dry-run -Werror -i -style=file `find ./src \( -name "*.h" -o -name "*.cpp" \)` || echo "sometimes flaky"
 
 lint: lint-cpp lint-py  ## Run project linters
@@ -61,7 +61,7 @@ fix-py:
 	python -m ruff arrow_python_nocopy --fix
 
 fix-cpp:
-	clang-format -i -style=file `find ./src -name "*.*pp"`
+	clang-format -i -style=file `find ./src \( -name "*.h" -o -name "*.cpp" \)`
 
 fix: fix-cpp fix-py  ## Run project autofixers
@@ -86,7 +86,7 @@ dist-wheel:  ## Create python wheel dist
 	python setup.py bdist_wheel $(OTHER_ARGS)
 
 dist-cibw:  ## Create python wheel dist with cibuildwheel
-	python -m cibuildwheel --output-dir dist 
+	python -m cibuildwheel --output-dir dist
 
 dist: build dist-sdist dist-wheel  ## Create python dists
 	python -m twine check target/wheels/*
diff --git a/arrow_python_nocopy/__init__.py b/arrow_python_nocopy/__init__.py
index 472a0d7..21ad566 100644
--- a/arrow_python_nocopy/__init__.py
+++ b/arrow_python_nocopy/__init__.py
@@ -2,12 +2,12 @@
 import os.path
 import pyarrow as pa
 import pandas as pd
 
-# from .lib.pybind11extension import array_info, create_array
-# from .lib.pybind11extension import schema_info, create_schema
+from .lib.pybind11extension import array_info, create_array
+from .lib.pybind11extension import schema_info, create_schema
 # from .lib.pybind11extension import table_info, create_table
 
-from .lib.cpythonextension import array_info, create_array
-from .lib.cpythonextension import schema_info, create_schema
+from .lib.cpythonextension import array_info as array_info_cp, create_array as create_array_cp
+from .lib.cpythonextension import schema_info as schema_info_cp, create_schema as create_schema_cp
 # from .lib.cpythonextension import table_info as table_info_cp, create_table as create_table_cp
@@ -24,18 +24,18 @@ def _table():
 
 
 def create_arrow_array_in_python():
     table = _table()
-    array = table.columns['a']
-    print(array_info(array))
+    array = table['a'].combine_chunks()
+    return array_info(array)
 
 
 def create_arrow_array_in_cpp():
-    return pa.Array._import_from_c_capsule(create_array())
+    return pa.Array._import_from_c_capsule(*create_array())
 
 
 def create_arrow_schema_in_python():
     table = _table()
     schema = table.schema
-    print(schema_info(schema))
+    return schema_info(schema)
 
 
 def create_arrow_schema_in_cpp():
diff --git a/arrow_python_nocopy/tests/test_all.py b/arrow_python_nocopy/tests/test_all.py
index 3e8c2cc..6937d34 100644
--- a/arrow_python_nocopy/tests/test_all.py
+++ b/arrow_python_nocopy/tests/test_all.py
@@ -14,7 +14,7 @@ class TestPybind:
 
     def test_create_array_in_python(self):
         table = _table()
-        array = table.columns[0].chunk(0)
+        array = table['a'].combine_chunks()
         assert array_info(array) == '[\n  1,\n  2,\n  3\n]'
 
     def test_create_schema_in_python(self):
@@ -22,18 +22,18 @@ def test_create_schema_in_python(self):
         assert schema_info(schema) == 'a: int32\nb: float\nc: string'
 
     def test_create_array_in_cpp(self):
-        array = create_array()
+        array = pa.Array._import_from_c_capsule(*create_array())
         assert str(array) == '[\n  1,\n  2,\n  3\n]'
 
     def test_create_schema_in_cpp(self):
-        schema = create_schema()
+        schema = pa.Schema._import_from_c_capsule(create_schema())
         assert str(schema) == 'a: int32\nb: float\nc: binary'
 
 
 class TestCPython:
     def test_create_array_in_python(self):
         table = _table()
-        array = table.columns[0].chunk(0)
+        array = table['a'].combine_chunks()
         assert array_info_cp(array) == '[\n  1,\n  2,\n  3\n]'
 
     def test_create_array_in_python_bad_value(self):
@@ -49,9 +49,9 @@ def test_create_schema_in_python_bad_value(self):
             schema_info_cp("blerg")
 
     def test_create_array_in_cpp(self):
-        array = create_array_cp()
+        array = pa.Array._import_from_c_capsule(*create_array_cp())
         assert str(array) == '[\n  1,\n  2,\n  3\n]'
 
     def test_create_schema_in_cpp(self):
-        schema = create_schema_cp()
+        schema = pa.Schema._import_from_c_capsule(create_schema_cp())
         assert str(schema) == 'a: int32\nb: float\nc: binary'
diff --git a/src/apn-python/caster.h b/src/apn-python/caster.h
new file mode 100644
index 0000000..2dc1989
--- /dev/null
+++ b/src/apn-python/caster.h
@@ -0,0 +1,74 @@
+#pragma once
+#include
+#include
+
+namespace pybind11 {
+namespace detail {
+  template <>
+  struct type_caster<std::shared_ptr<arrow::Array>> {
+  public:
+    PYBIND11_TYPE_CASTER(std::shared_ptr<arrow::Array>, const_name("arrow::Array"));
+
+    /* Python -> C++ */
+    bool load(handle src, bool) {
+      PyObject* array = src.ptr();
+      if(!PyObject_HasAttrString(array, "__arrow_c_array__"))
+        return false;
+
+      // unpack
+      value = unpack_array(array);
+      return true;
+    }
+
+    /* C++ -> Python */
+    static handle cast(std::shared_ptr<arrow::Array> src, return_value_policy /* policy */, handle /* parent */) {
+      return pack_array(src);
+    }
+  };
+
+  template <>
+  struct type_caster<std::shared_ptr<arrow::Schema>> {
+  public:
+    PYBIND11_TYPE_CASTER(std::shared_ptr<arrow::Schema>, const_name("arrow::Schema"));
+
+    /* Python -> C++ */
+    bool load(handle src, bool) {
+      PyObject* schema = src.ptr();
+
+      // parse arguments
+      if(!PyObject_HasAttrString(schema, "__arrow_c_schema__"))
+        return false;
+
+      // unpack
+      value = unpack_schema(schema);
+      return true;
+    }
+
+    /* C++ -> Python */
+    static handle cast(std::shared_ptr<arrow::Schema> src, return_value_policy /* policy */, handle /* parent */) {
+      return pack_schema(src);
+    }
+  };
+
+  // template <>
+  // struct type_caster<std::shared_ptr<arrow::Table>> {
+  // public:
+  //   PYBIND11_TYPE_CASTER(std::shared_ptr<arrow::Table>, const_name("pyarrow::Table"));
+  //   /* Python -> C++ */
+  //   bool load(handle src, bool) {
+  //     PyObject* source = src.ptr();
+  //     if(!arrow::py::is_table(source))
+  //       return false;
+  //     arrow::Result<std::shared_ptr<arrow::Table>> result = arrow::py::unwrap_table(source);
+  //     if(!result.ok())
+  //       return false;
+  //     value = std::static_pointer_cast<arrow::Table>(result.ValueOrDie());
+  //     return true;
+  //   }
+
+  //   /* C++ -> Python */
+  //   static handle cast(std::shared_ptr<arrow::Table> src, return_value_policy /* policy */, handle /* parent */) {
+  //     return arrow::py::wrap_table(src);
+  //   }
+  // };
+
+}
+}
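
For context, a short sketch (not part of the diff) of how these casters are consumed. The binding below is hypothetical (the function name `column_length` and the include path for caster.h are assumptions), but once the caster header is in scope, pybind11 converts arguments through the `__arrow_c_array__` capsule protocol automatically:

    // Hedged sketch, not from this patch: `column_length` is a hypothetical
    // binding, and the caster.h include path assumes the install layout above.
    #include <arrow/api.h>
    #include <pybind11/pybind11.h>
    #include "apn-python/caster.h"  // assumed path to the casters defined above

    int64_t column_length(std::shared_ptr<arrow::Array> array) {
      // pybind11 routes the Python argument through
      // type_caster<std::shared_ptr<arrow::Array>>::load, which unpacks the
      // (schema, array) capsule tuple into an arrow::Array
      return array->length();
    }

    PYBIND11_MODULE(example, m) {
      m.def("column_length", &column_length);
    }
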
diff --git a/src/apn-python/caster.hpp b/src/apn-python/caster.hpp
deleted file mode 100644
index c4368d5..0000000
--- a/src/apn-python/caster.hpp
+++ /dev/null
@@ -1,72 +0,0 @@
-// #pragma once
-
-// namespace pybind11 {
-// namespace detail {
-//   template <>
-//   struct type_caster<std::shared_ptr<arrow::Array>> {
-//   public:
-//     PYBIND11_TYPE_CASTER(std::shared_ptr<arrow::Array>, const_name("pyarrow::Array"));
-//     /* Python->C++ */
-//     bool load(handle src, bool) {
-//       PyObject* source = src.ptr();
-//       if(!arrow::py::is_array(source))
-//         return false;
-//       arrow::Result<std::shared_ptr<arrow::Array>> result = arrow::py::unwrap_array(source);
-//       if(!result.ok())
-//         return false;
-//       value = std::static_pointer_cast<arrow::Array>(result.ValueOrDie());
-//       return true;
-//     }
-
-//     /* C++ -> Python) */
-//     static handle cast(std::shared_ptr<arrow::Array> src, return_value_policy /* policy */, handle /* parent */) {
-//       return arrow::py::wrap_array(src);
-//     }
-//   };
-
-//   template <>
-//   struct type_caster<std::shared_ptr<arrow::Schema>> {
-//   public:
-//     PYBIND11_TYPE_CASTER(std::shared_ptr<arrow::Schema>, const_name("pyarrow::Schema"));
-//     /* Python->C++ */
-//     bool load(handle src, bool) {
-//       PyObject* source = src.ptr();
-//       if(!arrow::py::is_schema(source))
-//         return false;
-//       arrow::Result<std::shared_ptr<arrow::Schema>> result = arrow::py::unwrap_schema(source);
-//       if(!result.ok())
-//         return false;
-//       value = std::static_pointer_cast<arrow::Schema>(result.ValueOrDie());
-//       return true;
-//     }
-
-//     /* C++ -> Python) */
-//     static handle cast(std::shared_ptr<arrow::Schema> src, return_value_policy /* policy */, handle /* parent */) {
-//       return arrow::py::wrap_schema(src);
-//     }
-//   };
-
-//   template <>
-//   struct type_caster<std::shared_ptr<arrow::Table>> {
-//   public:
-//     PYBIND11_TYPE_CASTER(std::shared_ptr<arrow::Table>, const_name("pyarrow::Table"));
-//     /* Python->C++ */
-//     bool load(handle src, bool) {
-//       PyObject* source = src.ptr();
-//       if(!arrow::py::is_table(source))
-//         return false;
-//       arrow::Result<std::shared_ptr<arrow::Table>> result = arrow::py::unwrap_table(source);
-//       if(!result.ok())
-//         return false;
-//       value = std::static_pointer_cast<arrow::Table>(result.ValueOrDie());
-//       return true;
-//     }
-
-//     /* C++ -> Python) */
-//     static handle cast(std::shared_ptr<arrow::Table> src, return_value_policy /* policy */, handle /* parent */) {
-//       return arrow::py::wrap_table(src);
-//     }
-//   };
-
-// }
-// }
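
The common.cpp added next centralizes the capsule packing and unpacking. Underneath it is the plain Arrow C data interface round-trip; a minimal standalone sketch, assuming Arrow C++ with arrow/c/bridge.h and using ValueOrDie in the same spirit as the patch:

    // Sketch of the C data interface round-trip that common.cpp wraps in
    // PyCapsules: export an arrow::Array to the ABI-stable C structs, then
    // import it back. Error handling is elided to ValueOrDie for brevity.
    #include <arrow/api.h>
    #include <arrow/c/abi.h>
    #include <arrow/c/bridge.h>

    void round_trip() {
      arrow::Int32Builder builder;
      (void)builder.AppendValues({1, 2, 3});
      std::shared_ptr<arrow::Array> array = builder.Finish().ValueOrDie();

      struct ArrowArray c_array;
      struct ArrowSchema c_schema;
      (void)arrow::ExportArray(*array, &c_array, &c_schema);

      // Importing consumes the C structs (their release callbacks are moved)
      std::shared_ptr<arrow::DataType> type = arrow::ImportType(&c_schema).ValueOrDie();
      std::shared_ptr<arrow::Array> back = arrow::ImportArray(&c_array, type).ValueOrDie();
    }
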
diff --git a/src/apn-python/common.cpp b/src/apn-python/common.cpp
new file mode 100644
index 0000000..f7ed87d
--- /dev/null
+++ b/src/apn-python/common.cpp
@@ -0,0 +1,75 @@
+#include "arrow/c/abi.h"
+#include
+#include
+#include
+
+
+void ReleaseArrowArrayPyCapsule(PyObject* array_capsule) {
+  // struct ArrowArray* c_array = (struct ArrowArray*)PyCapsule_GetPointer(array_capsule, "arrow_array");
+  // if (c_array->release != NULL) {
+  //   c_array->release(c_array);
+  // }
+  // free(c_array);
+}
+
+void ReleaseArrowSchemaPyCapsule(PyObject* schema_capsule) {
+  // struct ArrowSchema* c_schema = (struct ArrowSchema*)PyCapsule_GetPointer(schema_capsule, "arrow_schema");
+  // if (c_schema->release != NULL) {
+  //   c_schema->release(c_schema);
+  // }
+  // free(c_schema);
+}
+
+std::shared_ptr<arrow::Array> unpack_array(PyObject* array) {
+  // call the method and get the tuple
+  PyObject* array_capsule_tuple = PyObject_CallNoArgs(PyObject_GetAttrString(array, "__arrow_c_array__"));
+  PyObject* schema_capsule_obj = PyTuple_GetItem(array_capsule_tuple, 0);
+  PyObject* array_capsule_obj = PyTuple_GetItem(array_capsule_tuple, 1);
+
+  // extract the capsule
+  struct ArrowArray* c_array = (struct ArrowArray*) PyCapsule_GetPointer(array_capsule_obj, "arrow_array");
+
+  // Convert C array to C++ array and extract info
+  std::shared_ptr<arrow::Array> arrow_array = arrow::ImportArray(c_array, unpack_dtype(schema_capsule_obj)).ValueOrDie();
+  return arrow_array;
+}
+
+PyObject* pack_array(std::shared_ptr<arrow::Array> array) {
+  // Convert to C api
+  struct ArrowArray* c_array = (struct ArrowArray*)malloc(sizeof(struct ArrowArray));
+  struct ArrowSchema* c_schema = (struct ArrowSchema*)malloc(sizeof(struct ArrowSchema));
+  (void)arrow::ExportArray(*array, c_array, c_schema);
+
+  // Hoist out to pycapsule
+  PyObject* array_capsule = PyCapsule_New(c_array, "arrow_array", ReleaseArrowArrayPyCapsule);
+  PyObject* schema_capsule = PyCapsule_New(c_schema, "arrow_schema", ReleaseArrowSchemaPyCapsule);
+
+  return PyTuple_Pack(2, schema_capsule, array_capsule);
+}
+
+std::shared_ptr<arrow::DataType> unpack_dtype(PyObject* dtype_capsule) {
+  // extract the capsule
+  struct ArrowSchema* c_dtype = (struct ArrowSchema*) PyCapsule_GetPointer(dtype_capsule, "arrow_schema");
+  std::shared_ptr<arrow::DataType> arrow_dtype = arrow::ImportType(c_dtype).ValueOrDie();
+  return arrow_dtype;
+}
+
+std::shared_ptr<arrow::Schema> unpack_schema(PyObject* schema) {
+  // extract the capsule
+  PyObject* schema_capsule = PyObject_CallNoArgs(PyObject_GetAttrString(schema, "__arrow_c_schema__"));
+  struct ArrowSchema* c_schema = (struct ArrowSchema*) PyCapsule_GetPointer(schema_capsule, "arrow_schema");
+
+  // Convert C schema to C++ schema and extract info
+  std::shared_ptr<arrow::Schema> arrow_schema = arrow::ImportSchema(c_schema).ValueOrDie();
+  return arrow_schema;
+}
+
+PyObject* pack_schema(std::shared_ptr<arrow::Schema> schema) {
+  // Convert to C api
+  struct ArrowSchema* c_schema = (struct ArrowSchema*)malloc(sizeof(struct ArrowSchema));
+  (void)arrow::ExportSchema(*schema, c_schema);
+
+  // Hoist out to pycapsule
+  return PyCapsule_New(c_schema, "arrow_schema", ReleaseArrowSchemaPyCapsule);
+}
diff --git a/src/apn-python/common.h b/src/apn-python/common.h
new file mode 100644
index 0000000..80bc1f6
--- /dev/null
+++ b/src/apn-python/common.h
@@ -0,0 +1,16 @@
+#pragma once
+#include
+
+#include "Python.h"
+#include
+#include
+
+LIB_EXPORT std::shared_ptr<arrow::Array> unpack_array(PyObject*);
+LIB_EXPORT std::shared_ptr<arrow::DataType> unpack_dtype(PyObject*);
+LIB_EXPORT PyObject* pack_array(std::shared_ptr<arrow::Array>);
+
+LIB_EXPORT std::shared_ptr<arrow::Schema> unpack_schema(PyObject*);
+LIB_EXPORT PyObject* pack_schema(std::shared_ptr<arrow::Schema>);
+
+// LIB_EXPORT char* table_info_py(std::shared_ptr<arrow::Table> table);
+// LIB_EXPORT std::shared_ptr<arrow::Table> create_table_py();
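
Note that the release callbacks above are left commented out while ownership is still being worked out, so capsules produced here currently leak their C structs. For reference, a hedged sketch of what a conforming destructor would do once that is settled (the Arrow PyCapsule convention is: call the struct's release callback if it is still set, then free the allocation):

    // Hedged sketch of a complete capsule destructor; this is roughly what
    // the commented-out bodies above would do. The name is hypothetical.
    #include <cstdlib>
    #include "Python.h"
    #include "arrow/c/abi.h"

    static void release_arrow_schema_capsule(PyObject* capsule) {
      struct ArrowSchema* c_schema =
          (struct ArrowSchema*)PyCapsule_GetPointer(capsule, "arrow_schema");
      if (c_schema != NULL && c_schema->release != NULL) {
        // release is only non-NULL if no consumer has imported (moved) the data
        c_schema->release(c_schema);
      }
      free(c_schema);
    }
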
diff --git a/src/apn-python/cpython.cpp b/src/apn-python/cpython.cpp
index 853a3c2..c43c3d5 100644
--- a/src/apn-python/cpython.cpp
+++ b/src/apn-python/cpython.cpp
@@ -1,118 +1,60 @@
-#include
-#include
+#include
+#include
+#include
 #include
 
-static PyObject* _raise_error(PyObject* module) {
-  PyErr_SetString(PyExc_TypeError, "Bad value provided");
+static PyObject* _raise_error(PyObject* module, const std::string& msg = "Bad value provided") {
+  PyErr_SetString(PyExc_TypeError, msg.c_str());
   return NULL;
 }
 
 PyObject* array_info_py(PyObject* self, PyObject* args) {
-  PyObject* source;
+  PyObject* array;
 
-  if(!PyArg_ParseTuple(args, "O", &source))
-    return _raise_error(self);
+  if(!PyArg_ParseTuple(args, "O", &array))
+    return _raise_error(self, "Bad arguments to array_info");
 
-  // Old non-pycapsule way, not ABI stable
-  // and relies on pyarrow
-  // if(!arrow::py::is_array(source))
-  //   return _raise_error(self);
-  // arrow::Result<std::shared_ptr<arrow::Array>> result = arrow::py::unwrap_array(source);
-  // if(!result.ok())
-  //   return _raise_error(self);
-  // char* ret_str = array_info_py(std::static_pointer_cast<arrow::Array>(result.ValueOrDie()));
+  if(!PyObject_HasAttrString(array, "__arrow_c_array__"))
+    return _raise_error(self, "Argument to array_info not an array");
 
-  // New pycapsule way
-  if(!PyObject_HasAttrString(source, "__arrow_c_array__"))
-    return _raise_error(self);
-
-  // extract the capsule
-  PyObject* array_capsule = PyObject_CallNoArgs(PyObject_GetAttrString(source, "__arrow_c_array__"));
-  struct ArrowArray* c_array = (struct ArrowArray*) PyCapsule_GetPointer(array_capsule, "arrow_array");
-
-  // Convert C array to C++ array and extract info
-  // TODO hardcoding datatype here, you would want to also pass in schema in real world
-  std::shared_ptr<arrow::Array> arrow_array = arrow::ImportArray(c_array, arrow::int32()).ValueOrDie();
+  // unpack
+  std::shared_ptr<arrow::Array> arrow_array = unpack_array(array);
 
   // Get info and return
   std::string info = array_info(arrow_array);
   return PyUnicode_FromStringAndSize(info.c_str(), info.length());
 }
 
-void ReleaseArrowArrayPyCapsule(PyObject* array_capsule) {
-  struct ArrowArray* c_array = (struct ArrowArray*)PyCapsule_GetPointer(array_capsule, "arrow_array");
-  // if (c_array->release != NULL) {
-  //   c_array->release(c_array);
-  // }
-  // free(c_array);
-}
-
 PyObject* create_array_py(PyObject* self, PyObject* args) {
-  // Old non-pycapsule way
-  // return arrow::py::wrap_array(create_array_py());
-
-  // Create array with arrow C++
   std::shared_ptr<arrow::Array> array = create_array('a');
-
-  // Convert to C api
-  struct ArrowArray *c_array = (struct ArrowArray*)malloc(sizeof(struct ArrowArray));
-  (void)arrow::ExportArray(*array, c_array);
-
-  // Hoist out to pycapsule
-  return PyCapsule_New(c_array, "arrow_array", ReleaseArrowArrayPyCapsule);
+  return pack_array(array);
 }
 
 PyObject* schema_info_py(PyObject* self, PyObject* args) {
-  PyObject* source;
+  PyObject* schema;
 
   // parse arguments
-  if(!PyArg_ParseTuple(args, "O", &source))
-    return _raise_error(self);
-
-  // Old non-pycapsule way, not ABI stable
-  // and relies on pyarrow
-  // if(!arrow::py::is_schema(source))
-  //   return _raise_error(self);
-  // if(!result.ok())
-  //   return _raise_error(self);
-  // arrow::Result<std::shared_ptr<arrow::Schema>> result = arrow::py::unwrap_schema(source);
-  // char* ret_str = schema_info_py(std::static_pointer_cast<arrow::Schema>(result.ValueOrDie()));
-
-  // New pycapsule way
-  if(!PyObject_HasAttrString(source, "__arrow_c_schema__"))
-    return _raise_error(self);
-
-  // extract the capsule
-  PyObject* schema_capsule = PyObject_CallNoArgs(PyObject_GetAttrString(source, "__arrow_c_schema__"));
-  struct ArrowSchema* c_schema = (struct ArrowSchema*) PyCapsule_GetPointer(schema_capsule, "arrow_schema");
-
-  // Convert C schema to C++ schema and extract info
-  std::shared_ptr<arrow::Schema> arrow_schema = arrow::ImportSchema(c_schema).ValueOrDie();
+  if(!PyArg_ParseTuple(args, "O", &schema))
+    return _raise_error(self, "Bad arguments to schema_info");
+
+  if(!PyObject_HasAttrString(schema, "__arrow_c_schema__"))
+    return _raise_error(self, "First argument to schema_info not a schema");
+
+  // unpack
+  std::shared_ptr<arrow::Schema> arrow_schema = unpack_schema(schema);
+
+  // Get info and return
  std::string info = schema_info(arrow_schema);
   return PyUnicode_FromStringAndSize(info.c_str(), info.length());
 }
 
-void ReleaseArrowSchemaPyCapsule(PyObject* schema_capsule) {
-  struct ArrowSchema* c_schema = (struct ArrowSchema*)PyCapsule_GetPointer(schema_capsule, "arrow_schema");
-  if (c_schema->release != NULL) {
-    c_schema->release(c_schema);
-  }
-  // free(c_schema);
-}
-
 PyObject* create_schema_py(PyObject* self, PyObject* Py_UNUSED(args)) {
-  // Old non-pycapsule way
-  // return arrow::py::wrap_schema(create_schema_py());
-
-  // Create array with arrow C++
   std::shared_ptr<arrow::Schema> schema = create_schema();
-
-  // Convert to C api
-  struct ArrowSchema* c_schema = (struct ArrowSchema*)malloc(sizeof(struct ArrowSchema));
-  (void)arrow::ExportSchema(*schema, c_schema);
-
-  // Hoist out to pycapsule
-  return PyCapsule_New(c_schema, "arrow_schema", ReleaseArrowSchemaPyCapsule);
+  return pack_schema(schema);
 }
 
 // std::string table_info_py_raw(PyObject* source) {
diff --git a/src/apn-python/cpython.hpp b/src/apn-python/cpython.h
similarity index 100%
rename from src/apn-python/cpython.hpp
rename to src/apn-python/cpython.h
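
The hunk above only touches the function bodies; the module scaffolding of cpython.cpp is unchanged and not shown. A hypothetical sketch of that scaffolding, with method names matching what __init__.py imports (the actual table in the repo may differ):

    // Hedged sketch of the module boilerplate that would expose the functions
    // above as the cpythonextension module; names and docstrings are assumptions.
    static PyMethodDef apn_methods[] = {
      {"array_info", (PyCFunction)array_info_py, METH_VARARGS, "describe an arrow array"},
      {"create_array", (PyCFunction)create_array_py, METH_NOARGS, "return (schema, array) capsules"},
      {"schema_info", (PyCFunction)schema_info_py, METH_VARARGS, "describe an arrow schema"},
      {"create_schema", (PyCFunction)create_schema_py, METH_NOARGS, "return a schema capsule"},
      {NULL, NULL, 0, NULL},
    };

    static struct PyModuleDef apn_module = {
      PyModuleDef_HEAD_INIT, "cpythonextension", NULL, -1, apn_methods,
    };

    PyMODINIT_FUNC PyInit_cpythonextension(void) {
      return PyModule_Create(&apn_module);
    }
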
diff --git a/src/apn-python/pybind11.cpp b/src/apn-python/pybind11.cpp
index 564996e..950da45 100644
--- a/src/apn-python/pybind11.cpp
+++ b/src/apn-python/pybind11.cpp
@@ -1,2 +1,31 @@
-// #include
+#include
+#include
+#include
+#include
+
+std::string array_info_py(std::shared_ptr<arrow::Array> arrow_array) {
+  return array_info(arrow_array);
+}
+
+std::shared_ptr<arrow::Array> create_array_py() {
+  return create_array('a');
+}
+
+std::string schema_info_py(std::shared_ptr<arrow::Schema> arrow_schema) {
+  return schema_info(arrow_schema);
+}
+
+std::shared_ptr<arrow::Schema> create_schema_py() {
+  return create_schema();
+}
+
+// std::string table_info_py(std::shared_ptr<arrow::Table> table) {
+//   return table_info(table);
+// }
+
+// std::shared_ptr<arrow::Table> create_table_py() {
+//   std::shared_ptr<arrow::Table> arrow_table = create_table();
+//   return arrow_table;
+//   // PyObject* obj = arrow::py::wrap_table(arrow_table);
+//   // return pybind11::cast(obj);
+// }
\ No newline at end of file
diff --git a/src/apn-python/pybind11.h b/src/apn-python/pybind11.h
new file mode 100644
index 0000000..02f500a
--- /dev/null
+++ b/src/apn-python/pybind11.h
@@ -0,0 +1,25 @@
+#pragma once
+#include
+#include
+#include
+
+namespace py = pybind11;
+
+LIB_EXPORT std::string array_info_py(std::shared_ptr<arrow::Array>);
+LIB_EXPORT std::shared_ptr<arrow::Array> create_array_py();
+
+LIB_EXPORT std::string schema_info_py(std::shared_ptr<arrow::Schema>);
+LIB_EXPORT std::shared_ptr<arrow::Schema> create_schema_py();
+
+// LIB_EXPORT std::string table_info_py(std::shared_ptr<arrow::Table>);
+// LIB_EXPORT std::shared_ptr<arrow::Table> create_table_py();
+
+PYBIND11_MODULE(pybind11extension, m) {
+  m.doc() = "pybind11";
+  m.def("array_info", &array_info_py);
+  m.def("create_array", &create_array_py);
+  m.def("schema_info", &schema_info_py);
+  m.def("create_schema", &create_schema_py);
+  // m.def("table_info", &table_info_py, "");
+  // m.def("create_table", &create_table_py, "");
+}
diff --git a/src/apn-python/pybind11.hpp b/src/apn-python/pybind11.hpp
deleted file mode 100644
index a625dfb..0000000
--- a/src/apn-python/pybind11.hpp
+++ /dev/null
@@ -1,17 +0,0 @@
-// #pragma once
-// #include
-// #include
-// #include
-
-// namespace py = pybind11;
-
-// // LIB_EXPORT PyObject* table_info_py_raw(PyObject*, PyObject*);
-// PYBIND11_MODULE(pybind11extension, m) {
-//   m.doc() = "pybind11";
-//   m.def("array_info", &array_info_py);
-//   m.def("create_array", &create_array_py);
-//   m.def("schema_info", &schema_info_py);
-//   m.def("create_schema", &create_schema_py);
-//   // m.def("table_info", &table_info_py, "");
-//   // m.def("create_table", &create_table_py, "");
-// }
diff --git a/src/apn/apn.cpp b/src/apn/apn.cpp
index d5c713a..5b973ee 100644
--- a/src/apn/apn.cpp
+++ b/src/apn/apn.cpp
@@ -1,5 +1,5 @@
 #include
-#include "apn/apn.hpp"
+#include "apn/apn.h"
 
 std::shared_ptr<arrow::Array> create_array(char variant) {
   if(variant == 'b') {
diff --git a/src/apn/apn.hpp b/src/apn/apn.h
similarity index 100%
rename from src/apn/apn.hpp
rename to src/apn/apn.h
diff --git a/src/apn/bridge.cpp b/src/apn/bridge.cpp
index d7afb96..52140fc 100644
--- a/src/apn/bridge.cpp
+++ b/src/apn/bridge.cpp
@@ -1,6 +1,6 @@
 #include
 #include
-#include <apn/apn.hpp>
+#include <apn/apn.h>
 
 int array_info_cabi(struct ArrowArray* c_array, char* buffer, size_t size) {
   std::shared_ptr<arrow::Array> arrow_array = arrow::ImportArray(c_array,
arrow::int32()).ValueOrDie(); From f5e8156518026e8ed0425f8ec6052c17267a4611 Mon Sep 17 00:00:00 2001 From: Tim Paine <3105306+timkpaine@users.noreply.github.com> Date: Fri, 23 Feb 2024 17:21:45 -0500 Subject: [PATCH 3/4] bump checkout to 4 --- .github/workflows/build.yml | 44 ++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 9fd3924..b9b1277 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -4,7 +4,7 @@ on: push: branches: - main - - tkp/linux + - tkp/linux tags: - v* paths-ignore: @@ -52,11 +52,11 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: # for pull_request so we can do HEAD^2 fetch-depth: 2 - + - name: Initialize variables uses: actions-ext/general/initialize-variables@v1 id: initialize @@ -91,7 +91,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: true @@ -109,7 +109,7 @@ jobs: # - name: Install dependencies # run: make develop-cpp - + - name: Install dependencies run: make develop-arrow-ubuntu if: ${{ runner.os == 'Linux' }} @@ -167,7 +167,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Run cibuildwheel uses: actions-ext/python/run-cibuildwheel@v1 @@ -215,7 +215,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Setup Python uses: actions-ext/python/setup@v1 @@ -228,14 +228,14 @@ jobs: # - name: Install dependencies # run: make develop-cpp - + - name: Install dependencies run: make develop-arrow-ubuntu if: ${{ runner.os == 'Linux' }} - name: Install dependencies run: make develop-py - + - name: Run cibuildwheel uses: actions-ext/python/run-cibuildwheel@v1 with: @@ -270,10 +270,10 @@ jobs: - '3.9' runs-on: ${{ matrix.os }} if: ${{ needs.initialize.outputs.FULL_BUILD == 'true' }} - + steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Setup Python uses: actions-ext/python/setup@v1 @@ -286,14 +286,14 @@ jobs: # - name: Install dependencies # run: make develop-cpp - + - name: Install dependencies run: make develop-arrow-ubuntu if: ${{ runner.os == 'Linux' }} - name: Install dependencies run: make develop-py - + - name: Build sdist run: make dist-sdist @@ -328,7 +328,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Setup Python uses: actions-ext/python/setup@v1 @@ -380,7 +380,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Setup Python uses: actions-ext/python/setup@v1 @@ -393,14 +393,14 @@ jobs: # - name: Install dependencies # run: make develop-cpp - + - name: Install dependencies run: make develop-arrow-ubuntu if: ${{ runner.os == 'Linux' }} - name: Install dependencies run: make develop-py - + - name: Download wheels uses: actions-ext/python/download-dist@v1 with: @@ -409,7 +409,7 @@ jobs: - name: Test project run: make tests-ci - + - name: Upload test results uses: EnricoMi/publish-unit-test-result-action@v2 with: @@ -457,7 +457,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Setup Python uses: actions-ext/python/setup@v1 @@ -470,14 +470,14 @@ jobs: # - name: Install dependencies # run: make develop-cpp - + - name: Install dependencies run: make develop-arrow-ubuntu if: ${{ runner.os == 'Linux' }} - name: Install dependencies run: make develop-py - + - 
name: Download wheels uses: actions-ext/python/download-dist@v1 with: @@ -485,7 +485,7 @@ jobs: - name: Test project run: make tests-ci - + - name: Upload test results uses: EnricoMi/publish-unit-test-result-action@v2 with: From 14340ea8dec80601805fd50204d9eecb7b2b026e Mon Sep 17 00:00:00 2001 From: Tim Paine <3105306+timkpaine@users.noreply.github.com> Date: Sat, 24 Feb 2024 00:47:25 +0000 Subject: [PATCH 4/4] fix linux build by removing CXXABI flag from older arrow --- CMakeLists.txt | 8 ++++---- arrow_python_nocopy/__init__.py | 3 --- pyproject.toml | 1 - src/apn-python/common.h | 1 + vcpkg | 2 +- 5 files changed, 6 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4916f39..cb5c0ec 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -121,7 +121,7 @@ if(NOT WIN32) if(MACOS) set(module_origin_path "@loader_path") else() - set(module_origin_path "\$ORIGIN") + set(module_origin_path "\$ORIGIN") endif() else() set(CMAKE_SHARED_LIBRARY_PREFIX "lib") @@ -132,7 +132,7 @@ endif() # Flags # ######### set(CMAKE_POSITION_INDEPENDENT_CODE On) -add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) +# add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) # Compiler version flags if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") @@ -203,7 +203,7 @@ endif() ############################################################################################################### # Messages # ############ -message("Building CPP Template version v${ARROW_PYTHON_NOCOPY_VERSION_MAJOR}.${ARROW_PYTHON_NOCOPY_VERSION_MINOR}.${ARROW_PYTHON_NOCOPY_VERSION_PATCH} [${ARROW_PYTHON_NOCOPY_VERSION_COMMIT_SHA}]") +message("Building Arrow-Python-Nocopy version v${ARROW_PYTHON_NOCOPY_VERSION_MAJOR}.${ARROW_PYTHON_NOCOPY_VERSION_MINOR}.${ARROW_PYTHON_NOCOPY_VERSION_PATCH} [${ARROW_PYTHON_NOCOPY_VERSION_COMMIT_SHA}]") string(TOLOWER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE_LOWER) if(CMAKE_BUILD_TYPE_LOWER STREQUAL debug) @@ -235,7 +235,7 @@ set( ) add_library(arrow-python-nocopy SHARED ${PROJECT_SRCS}) -target_link_libraries(arrow-python-nocopy PRIVATE ${Arrow_LIBRARY}) +target_link_libraries(arrow-python-nocopy PRIVATE Arrow::arrow_static) set_target_properties(arrow-python-nocopy PROPERTIES PUBLIC_HEADER "${PROJECT_HDRS}") # export symbols diff --git a/arrow_python_nocopy/__init__.py b/arrow_python_nocopy/__init__.py index 21ad566..1ff1346 100644 --- a/arrow_python_nocopy/__init__.py +++ b/arrow_python_nocopy/__init__.py @@ -4,11 +4,8 @@ import pandas as pd from .lib.pybind11extension import array_info, create_array from .lib.pybind11extension import schema_info, create_schema -# from .lib.pybind11extension import table_info, create_table - from .lib.cpythonextension import array_info as array_info_cp, create_array as create_array_cp from .lib.cpythonextension import schema_info as schema_info_cp, create_schema as create_schema_cp -# from .lib.cpythonextension import table_info as table_info_cp, create_table as create_table_cp __version__ = "0.1.0" diff --git a/pyproject.toml b/pyproject.toml index 60fcee9..33108cd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -107,7 +107,6 @@ line-length = 170 "__init__.py" = ["F401", "F403"] [tool.pytest.ini_options] -asyncio_mode = "strict" testpaths = "arrow_python_nocopy/tests" [tool.setuptools] diff --git a/src/apn-python/common.h b/src/apn-python/common.h index 80bc1f6..4f3944d 100644 --- a/src/apn-python/common.h +++ b/src/apn-python/common.h @@ -4,6 +4,7 @@ #include "Python.h" #include #include +#include LIB_EXPORT std::shared_ptr unpack_array(PyObject*); LIB_EXPORT std::shared_ptr 
unpack_dtype(PyObject*); diff --git a/vcpkg b/vcpkg index 08f00b4..69baa84 160000 --- a/vcpkg +++ b/vcpkg @@ -1 +1 @@ -Subproject commit 08f00b4f017129450e1e04ad943b77a974603645 +Subproject commit 69baa842721d9083bb8278ca85c73dd328b1ebdc