Skip to content

Commit

Permalink
Add Python impl
Browse files Browse the repository at this point in the history
  • Loading branch information
lidavidm committed Jun 30, 2024
1 parent 469c9a4 commit 1456a3b
Show file tree
Hide file tree
Showing 7 changed files with 158 additions and 2 deletions.
5 changes: 3 additions & 2 deletions python/pyarrow/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ def print_entry(label, value):
dictionary,
run_end_encoded,
fixed_shape_tensor,
opaque,
field,
type_for_alias,
DataType, DictionaryType, StructType,
Expand All @@ -182,7 +183,7 @@ def print_entry(label, value):
TimestampType, Time32Type, Time64Type, DurationType,
FixedSizeBinaryType, Decimal128Type, Decimal256Type,
BaseExtensionType, ExtensionType,
RunEndEncodedType, FixedShapeTensorType,
RunEndEncodedType, FixedShapeTensorType, OpaqueType,
PyExtensionType, UnknownExtensionType,
register_extension_type, unregister_extension_type,
DictionaryMemo,
Expand Down Expand Up @@ -216,7 +217,7 @@ def print_entry(label, value):
Time32Array, Time64Array, DurationArray,
MonthDayNanoIntervalArray,
Decimal128Array, Decimal256Array, StructArray, ExtensionArray,
RunEndEncodedArray, FixedShapeTensorArray,
RunEndEncodedArray, FixedShapeTensorArray, OpaqueArray,
scalar, NA, _NULL as NULL, Scalar,
NullScalar, BooleanScalar,
Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar,
Expand Down
28 changes: 28 additions & 0 deletions python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -4438,6 +4438,34 @@ cdef class FixedShapeTensorArray(ExtensionArray):
)


cdef class OpaqueArray(ExtensionArray):
    """
    Concrete class for opaque extension arrays.

    This is the array class returned by ``OpaqueType.__arrow_ext_class__``.
    It defines no methods of its own on top of ``ExtensionArray``.

    Examples
    --------
    Define the extension type for opaque array

    >>> import pyarrow as pa
    >>> opaque_type = pa.opaque(
    ...     pa.binary(),
    ...     type_name="geometry",
    ...     vendor_name="postgis",
    ... )

    Create an extension array

    >>> arr = [None, b"data"]
    >>> storage = pa.array(arr, pa.binary())
    >>> pa.ExtensionArray.from_storage(opaque_type, storage)
    <pyarrow.lib.OpaqueArray object at ...>
    [
      null,
      64617461
    ]
    """


cdef dict _array_classes = {
_Type_NA: NullArray,
_Type_BOOL: BooleanArray,
Expand Down
13 changes: 13 additions & 0 deletions python/pyarrow/includes/libarrow.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -2882,6 +2882,19 @@ cdef extern from "arrow/extension/fixed_shape_tensor.h" namespace "arrow::extens
" arrow::extension::FixedShapeTensorArray"(CExtensionArray):
const CResult[shared_ptr[CTensor]] ToTensor() const


# Cython declarations for the opaque extension type and array exposed by
# the C++ header "arrow/extension/opaque.h".
cdef extern from "arrow/extension/opaque.h" namespace "arrow::extension" nogil:
    cdef cppclass COpaqueType \
            " arrow::extension::OpaqueType"(CExtensionType):

        # Both accessors are declared as returning c_string; the Python
        # wrapper (OpaqueType in types.pxi) wraps each result in
        # c_string(...) before decoding it with frombytes().
        c_string type_name()
        c_string vendor_name()

    # No members are declared beyond what CExtensionArray already provides.
    cdef cppclass COpaqueArray \
            " arrow::extension::OpaqueArray"(CExtensionArray):
        pass


cdef extern from "arrow/util/compression.h" namespace "arrow" nogil:
cdef enum CCompressionType" arrow::Compression::type":
CCompressionType_UNCOMPRESSED" arrow::Compression::UNCOMPRESSED"
Expand Down
5 changes: 5 additions & 0 deletions python/pyarrow/lib.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,11 @@ cdef class FixedShapeTensorType(BaseExtensionType):
const CFixedShapeTensorType* tensor_ext_type


cdef class OpaqueType(BaseExtensionType):
    cdef:
        # Typed pointer to the wrapped C++ opaque type. It is assigned in
        # OpaqueType.init() by casting the raw pointer of the shared_ptr
        # handed to init().
        # NOTE(review): presumably the base class keeps that shared_ptr
        # alive for the lifetime of this object -- confirm.
        const COpaqueType* opaque_ext_type


# Declares no C-level attributes of its own on top of ExtensionType.
cdef class PyExtensionType(ExtensionType):
    pass

Expand Down
2 changes: 2 additions & 0 deletions python/pyarrow/public-api.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,8 @@ cdef api object pyarrow_wrap_data_type(
return cpy_ext_type.GetInstance()
elif ext_type.extension_name() == b"arrow.fixed_shape_tensor":
out = FixedShapeTensorType.__new__(FixedShapeTensorType)
elif ext_type.extension_name() == b"arrow.opaque":
out = OpaqueType.__new__(OpaqueType)
else:
out = BaseExtensionType.__new__(BaseExtensionType)
else:
Expand Down
6 changes: 6 additions & 0 deletions python/pyarrow/scalar.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -1085,6 +1085,12 @@ cdef class FixedShapeTensorScalar(ExtensionScalar):
return pyarrow_wrap_tensor(ctensor)


cdef class OpaqueScalar(ExtensionScalar):
    """
    Concrete class for opaque extension scalar.

    This is the scalar class returned by
    ``OpaqueType.__arrow_ext_scalar_class__``. It defines no methods of
    its own on top of ``ExtensionScalar``.
    """


cdef dict _scalar_classes = {
_Type_BOOL: BooleanScalar,
_Type_UINT8: UInt8Scalar,
Expand Down
101 changes: 101 additions & 0 deletions python/pyarrow/types.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -1810,6 +1810,50 @@ cdef class FixedShapeTensorType(BaseExtensionType):
return FixedShapeTensorScalar


cdef class OpaqueType(BaseExtensionType):
    """
    Concrete class for opaque extension type.

    Opaque is a placeholder for a type from an external (often non-Arrow)
    system that could not be interpreted.

    Examples
    --------
    Create an instance of opaque extension type:

    >>> import pyarrow as pa
    >>> pa.opaque(pa.int32(), "geometry", "postgis")
    OpaqueType(extension<arrow.opaque[storage_type=int32, type_name=geometry, vendor_name=postgis]>)
    """

    cdef void init(self, const shared_ptr[CDataType]& type) except *:
        # Run the base-class initialisation first, then cache a pointer
        # typed as COpaqueType so the accessors below can call
        # type_name() / vendor_name() on it.
        BaseExtensionType.init(self, type)
        self.opaque_ext_type = <const COpaqueType*> type.get()

    @property
    def type_name(self):
        """
        Name of the type as known to the external system.
        """
        cdef c_string raw_type_name = c_string(
            self.opaque_ext_type.type_name())
        return frombytes(raw_type_name)

    @property
    def vendor_name(self):
        """
        Name of the external system the type comes from.
        """
        cdef c_string raw_vendor_name = c_string(
            self.opaque_ext_type.vendor_name())
        return frombytes(raw_vendor_name)

    def __arrow_ext_class__(self):
        # Array class used for arrays of this type.
        return OpaqueArray

    def __arrow_ext_scalar_class__(self):
        # Scalar class used for scalars of this type.
        return OpaqueScalar

    def __reduce__(self):
        # Pickle by recording the arguments needed to call opaque() again.
        ctor_args = (self.storage_type, self.type_name, self.vendor_name)
        return opaque, ctor_args


_py_extension_type_auto_load = False


Expand Down Expand Up @@ -5207,6 +5251,63 @@ def fixed_shape_tensor(DataType value_type, shape, dim_names=None, permutation=N
return out


def opaque(DataType storage_type not None,
           str type_name not None,
           str vendor_name not None):
    """
    Create instance of opaque extension type.

    Parameters
    ----------
    storage_type : DataType
        The underlying data type.
    type_name : str
        The name of the type in the external system.
    vendor_name : str
        The name of the external system.

    Returns
    -------
    type : OpaqueType

    Raises
    ------
    TypeError
        If any argument is None or is not of the declared type.

    Examples
    --------
    Create an instance of opaque extension type:

    >>> import pyarrow as pa
    >>> opaque_type = pa.opaque(pa.binary(), "other", "jdbc")
    >>> opaque_type
    OpaqueType(extension<arrow.opaque[storage_type=binary, type_name=other, vendor_name=jdbc]>)

    Inspect the data type:

    >>> opaque_type.storage_type
    DataType(binary)
    >>> opaque_type.type_name
    'other'
    >>> opaque_type.vendor_name
    'jdbc'

    Create a table with an opaque extension array:

    >>> arr = [None, b"foobar"]
    >>> storage = pa.array(arr, pa.binary())
    >>> other = pa.ExtensionArray.from_storage(opaque_type, storage)
    >>> pa.table([other], names=["unknown_col"])
    pyarrow.Table
    unknown_col: extension<arrow.opaque[storage_type=binary, type_name=other, vendor_name=jdbc]>
    ----
    unknown_col: [[null,666F6F626172]]
    """

    # storage_type is declared ``not None`` because its C-level ``sp_type``
    # attribute is read directly below; rejecting None at the call boundary
    # gives a TypeError instead of dereferencing a None object.
    cdef:
        c_string c_type_name = tobytes(type_name)
        c_string c_vendor_name = tobytes(vendor_name)
        shared_ptr[CDataType] c_type = make_shared[COpaqueType](
            storage_type.sp_type, c_type_name, c_vendor_name)
        OpaqueType out = OpaqueType.__new__(OpaqueType)
    # __new__ bypasses __init__; init() attaches the C++ type to the wrapper.
    out.init(c_type)
    return out


cdef dict _type_aliases = {
'null': null,
'bool': bool_,
Expand Down

0 comments on commit 1456a3b

Please sign in to comment.