From 4db2ca0743b2029f0c3c1049cdc5a9c453f55b15 Mon Sep 17 00:00:00 2001 From: Kevin H Wilson Date: Tue, 27 Aug 2024 13:42:37 +0000 Subject: [PATCH 001/130] UuidType -> RationalType in the docs In a previous version of the docs, a UuidType was discussed as an extension type. However, this type has been promoted to a canonical type, and so no longer is a good example of an extension type a user may wish to create. We replace UuidType in the docs with a RationalType --- docs/source/format/Integration.rst | 31 +++-- docs/source/python/extending_types.rst | 86 ++++++++----- python/pyarrow/types.pxi | 166 ++++++++++++++++--------- 3 files changed, 182 insertions(+), 101 deletions(-) diff --git a/docs/source/format/Integration.rst b/docs/source/format/Integration.rst index 0ab5b832ad012..b4e50a54b07d1 100644 --- a/docs/source/format/Integration.rst +++ b/docs/source/format/Integration.rst @@ -390,20 +390,37 @@ but can be of any type. Extension types are, as in the IPC format, represented as their underlying storage type plus some dedicated field metadata to reconstruct the extension -type. For example, assuming a "uuid" extension type backed by a -FixedSizeBinary(16) storage, here is how a "uuid" field would be represented:: +type. For example, assuming a "rational" extension type backed by a +``struct`` storage, here is how a "rational" field +would be represented:: { "name" : "name_of_the_field", "nullable" : /* boolean */, "type" : { - "name" : "fixedsizebinary", - "byteWidth" : 16 + "name" : "struct", }, - "children" : [], + "children" : [ + { + "name": "numer", + "type": { + "name": "int", + "bitWidth": 32, + "isSigned": true + }, + }, + { + "name": "denom", + "type": { + "name": "int", + "bitWidth": 32, + "isSigned": true + } + }, + ], "metadata" : [ - {"key": "ARROW:extension:name", "value": "uuid"}, - {"key": "ARROW:extension:metadata", "value": "uuid-serialized"} + {"key": "ARROW:extension:name", "value": "rational"}, + {"key": "ARROW:extension:metadata", "value": "rational-serialized"} ] } diff --git a/docs/source/python/extending_types.rst b/docs/source/python/extending_types.rst index d746505348157..67cb3425dc99c 100644 --- a/docs/source/python/extending_types.rst +++ b/docs/source/python/extending_types.rst @@ -131,50 +131,72 @@ and serialization mechanism. The extension name and serialized metadata can potentially be recognized by other (non-Python) Arrow implementations such as PySpark. -For example, we could define a custom UUID type for 128-bit numbers which can -be represented as ``FixedSizeBinary`` type with 16 bytes:: +For example, we could define a custom rational type for fractions which can +be represented as a pair of integers:: - class UuidType(pa.ExtensionType): + import pyarrow.types as pt + + class RationalType(pa.ExtensionType): def __init__(self): - super().__init__(pa.binary(16), "my_package.uuid") - def __arrow_ext_serialize__(self): - # Since we don't have a parameterized type, we don't need extra - # metadata to be deserialized - return b'' + super().__init__( + pa.struct( + [ + ("numer", pa.int64()), + ("denom", pa.int64()), + ], + ), + "my_package.rational", + ) + + def __arrow_ext_serialize__(self) -> bytes: + # No serialized metadata necessary + return b"" @classmethod - def __arrow_ext_deserialize__(cls, storage_type, serialized): + def __arrow_ext_deserialize__(self, storage_type, serialized): # Sanity checks, not required but illustrate the method signature. 
- assert storage_type == pa.binary(16) + assert pt.is_int32(storage_type) assert serialized == b'' - # Return an instance of this subclass given the serialized - # metadata. - return UuidType() + + # return an instance of this subclass given the serialized + # metadata + return RationalType() + The special methods ``__arrow_ext_serialize__`` and ``__arrow_ext_deserialize__`` -define the serialization of an extension type instance. For non-parametric -types such as the above, the serialization payload can be left empty. +define the serialization of an extension type instance. This can now be used to create arrays and tables holding the extension type:: - >>> uuid_type = UuidType() - >>> uuid_type.extension_name - 'my_package.uuid' - >>> uuid_type.storage_type - FixedSizeBinaryType(fixed_size_binary[16]) - - >>> import uuid - >>> storage_array = pa.array([uuid.uuid4().bytes for _ in range(4)], pa.binary(16)) - >>> arr = pa.ExtensionArray.from_storage(uuid_type, storage_array) + >>> rational_type = RationalType() + >>> rational_type.extension_name + 'my_package.rational' + >>> rational_type.storage_type + StructType(struct) + + >>> storage_array = pa.array( + ... [ + ... {"numer": 10, "denom": 17}, + ... {"numer": 20, "denom": 13}, + ... ], + ... type=rational_type.storage_type + ... ) + >>> arr = rational_type.wrap_array(storage_array) + >>> arr = pa.ExtensionArray.from_storage(rational_type, storage_array) >>> arr - + + -- is_valid: all not null + -- child 0 type: int32 + [ + 10, + 20 + ] + -- child 1 type: int32 [ - A6861959108644B797664AEEE686B682, - 718747F48E5F4058A7261E2B6B228BE8, - 7FE201227D624D96A5CD8639DEF2A68B, - C6CA8C7F95744BFD9462A40B3F57A86C + 17, + 13 ] This array can be included in RecordBatches, sent over IPC and received in @@ -182,7 +204,7 @@ another Python process. The receiving process must explicitly register the extension type for deserialization, otherwise it will fall back to the storage type:: - >>> pa.register_extension_type(UuidType()) + >>> pa.register_extension_type(RationalType()) For example, creating a RecordBatch and writing it to a stream using the IPC protocol:: @@ -198,10 +220,10 @@ and then reading it back yields the proper type:: >>> with pa.ipc.open_stream(buf) as reader: ... result = reader.read_all() >>> result.column('ext').type - UuidType(FixedSizeBinaryType(fixed_size_binary[16])) + RationalType(StructType(struct)) The receiving application doesn't need to be Python but can still recognize -the extension type as a "my_package.uuid" type, if it has implemented its own +the extension type as a "my_package.rational" type, if it has implemented its own extension type to receive it. If the type is not registered in the receiving application, it will fall back to the storage type. diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 563782f0c2643..1cb929eac6bf2 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1618,59 +1618,79 @@ cdef class ExtensionType(BaseExtensionType): Examples -------- - Define a UuidType extension type subclassing ExtensionType: + Define a RationalType extension type subclassing ExtensionType: >>> import pyarrow as pa - >>> class UuidType(pa.ExtensionType): - ... def __init__(self): - ... pa.ExtensionType.__init__(self, pa.binary(16), "my_package.uuid") - ... def __arrow_ext_serialize__(self): - ... # since we don't have a parameterized type, we don't need extra - ... # metadata to be deserialized - ... return b'' - ... @classmethod - ... 
def __arrow_ext_deserialize__(self, storage_type, serialized): - ... # return an instance of this subclass given the serialized - ... # metadata. - ... return UuidType() - ... + >>> import pyarrow.types as pt + >>> class RationalType(pa.ExtensionType): + ... def __init__(self, data_type: pa.DataType): + ... if not pt.is_integer(data_type): + ... raise TypeError(f"data_type must be an integer type not {data_type}") + ... super().__init__( + ... pa.struct( + ... [ + ... ("numer", data_type), + ... ("denom", data_type), + ... ], + ... ), + ... # N.B. This name does _not_ reference `data_type` so deserialization + ... # will work for _any_ integer `data_type` after registration + ... "my_package.rational", + ... ) + ... def __arrow_ext_serialize__(self) -> bytes: + ... # No serialized metadata necessary + ... return b"" + ... @classmethod + ... def __arrow_ext_deserialize__(self, storage_type, serialized): + ... # return an instance of this subclass given the serialized + ... # metadata + ... return RationalType(storage_type[0].type) Register the extension type: - >>> pa.register_extension_type(UuidType()) + >>> pa.register_extension_type(RationalType(pa.int64())) - Create an instance of UuidType extension type: + Create an instance of RationalType extension type: - >>> uuid_type = UuidType() + >>> rational_type = RationalType(pa.int32()) Inspect the extension type: - >>> uuid_type.extension_name - 'my_package.uuid' - >>> uuid_type.storage_type - FixedSizeBinaryType(fixed_size_binary[16]) + >>> rational_type.extension_name + 'my_package.rational' + >>> rational_type.storage_type + StructType(struct) Wrap an array as an extension array: - >>> import uuid - >>> storage_array = pa.array([uuid.uuid4().bytes for _ in range(4)], pa.binary(16)) - >>> uuid_type.wrap_array(storage_array) + >>> storage_array = pa.array( + ... [ + ... {"numer": 10, "denom": 17}, + ... {"numer": 20, "denom": 13}, + ... ], + ... type=rational_type.storage_type + ... ) + >>> ratoinal_type.wrap_array(storage_array) - [ + -- is_valid: all not null ... - ] Or do the same with creating an ExtensionArray: - >>> pa.ExtensionArray.from_storage(uuid_type, storage_array) + >>> pa.ExtensionArray.from_storage(rational_type, storage_array) - [ + -- is_valid: all not null ... - ] Unregister the extension type: - >>> pa.unregister_extension_type("my_package.uuid") + >>> pa.unregister_extension_type("my_package.rational") + + Note that even though we registered the concrete type + ``RationalType(pa.int64())``, pyarrow will be able to deserialize + ``RationalType(integer_type)`` for any ``integer_type`` as the deserializer + will reference the name ``my_package.rational`` and the ``@classmethod`` + ``__arrow_ext_deserialize__``. """ def __cinit__(self): @@ -2039,30 +2059,41 @@ def register_extension_type(ext_type): Examples -------- - Define a UuidType extension type subclassing ExtensionType: + Define a RationalType extension type subclassing ExtensionType: >>> import pyarrow as pa - >>> class UuidType(pa.ExtensionType): - ... def __init__(self): - ... pa.ExtensionType.__init__(self, pa.binary(16), "my_package.uuid") - ... def __arrow_ext_serialize__(self): - ... # since we don't have a parameterized type, we don't need extra - ... # metadata to be deserialized - ... return b'' - ... @classmethod - ... def __arrow_ext_deserialize__(self, storage_type, serialized): - ... # return an instance of this subclass given the serialized - ... # metadata. - ... return UuidType() - ... 
+ >>> import pyarrow.types as pt + >>> class RationalType(pa.ExtensionType): + ... def __init__(self, data_type: pa.DataType): + ... if not pt.is_integer(data_type): + ... raise TypeError(f"data_type must be an integer type not {data_type}") + ... super().__init__( + ... pa.struct( + ... [ + ... ("numer", data_type), + ... ("denom", data_type), + ... ], + ... ), + ... # N.B. This name does _not_ reference `data_type` so deserialization + ... # will work for _any_ integer `data_type` after registration + ... "my_package.rational", + ... ) + ... def __arrow_ext_serialize__(self) -> bytes: + ... # No serialized metadata necessary + ... return b"" + ... @classmethod + ... def __arrow_ext_deserialize__(self, storage_type, serialized): + ... # return an instance of this subclass given the serialized + ... # metadata + ... return RationalType(storage_type[0].type) Register the extension type: - >>> pa.register_extension_type(UuidType()) + >>> pa.register_extension_type(RationalType(pa.int64())) Unregister the extension type: - >>> pa.unregister_extension_type("my_package.uuid") + >>> pa.unregister_extension_type("my_package.rational") """ cdef: DataType _type = ensure_type(ext_type, allow_none=False) @@ -2089,30 +2120,41 @@ def unregister_extension_type(type_name): Examples -------- - Define a UuidType extension type subclassing ExtensionType: + Define a RationalType extension type subclassing ExtensionType: >>> import pyarrow as pa - >>> class UuidType(pa.ExtensionType): - ... def __init__(self): - ... pa.ExtensionType.__init__(self, pa.binary(16), "my_package.uuid") - ... def __arrow_ext_serialize__(self): - ... # since we don't have a parameterized type, we don't need extra - ... # metadata to be deserialized - ... return b'' - ... @classmethod - ... def __arrow_ext_deserialize__(self, storage_type, serialized): - ... # return an instance of this subclass given the serialized - ... # metadata. - ... return UuidType() - ... + >>> import pyarrow.types as pt + >>> class RationalType(pa.ExtensionType): + ... def __init__(self, data_type: pa.DataType): + ... if not pt.is_integer(data_type): + ... raise TypeError(f"data_type must be an integer type not {data_type}") + ... super().__init__( + ... pa.struct( + ... [ + ... ("numer", data_type), + ... ("denom", data_type), + ... ], + ... ), + ... # N.B. This name does _not_ reference `data_type` so deserialization + ... # will work for _any_ integer `data_type` after registration + ... "my_package.rational", + ... ) + ... def __arrow_ext_serialize__(self) -> bytes: + ... # No serialized metadata necessary + ... return b"" + ... @classmethod + ... def __arrow_ext_deserialize__(self, storage_type, serialized): + ... # return an instance of this subclass given the serialized + ... # metadata + ... 
return RationalType(storage_type[0].type) Register the extension type: - >>> pa.register_extension_type(UuidType()) + >>> pa.register_extension_type(RationalType(pa.int64())) Unregister the extension type: - >>> pa.unregister_extension_type("my_package.uuid") + >>> pa.unregister_extension_type("my_package.rational") """ cdef: c_string c_type_name = tobytes(type_name) From 4d95f227dd66bd5beab8ea292579bca9e55a4b34 Mon Sep 17 00:00:00 2001 From: Kevin H Wilson Date: Tue, 27 Aug 2024 13:47:34 +0000 Subject: [PATCH 002/130] fix json formatting --- docs/source/format/Integration.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/format/Integration.rst b/docs/source/format/Integration.rst index b4e50a54b07d1..f76aa0fefcf27 100644 --- a/docs/source/format/Integration.rst +++ b/docs/source/format/Integration.rst @@ -398,7 +398,7 @@ would be represented:: "name" : "name_of_the_field", "nullable" : /* boolean */, "type" : { - "name" : "struct", + "name" : "struct" }, "children" : [ { @@ -407,7 +407,7 @@ would be represented:: "name": "int", "bitWidth": 32, "isSigned": true - }, + } }, { "name": "denom", @@ -416,7 +416,7 @@ would be represented:: "bitWidth": 32, "isSigned": true } - }, + } ], "metadata" : [ {"key": "ARROW:extension:name", "value": "rational"}, From fde32158fa5f9a5f943da5aacd2bd8056b7a5bb1 Mon Sep 17 00:00:00 2001 From: Kevin H Wilson Date: Tue, 27 Aug 2024 14:06:25 +0000 Subject: [PATCH 003/130] fix some typos --- docs/source/python/extending_types.rst | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/source/python/extending_types.rst b/docs/source/python/extending_types.rst index 67cb3425dc99c..a881957ccce4a 100644 --- a/docs/source/python/extending_types.rst +++ b/docs/source/python/extending_types.rst @@ -134,6 +134,7 @@ such as PySpark. For example, we could define a custom rational type for fractions which can be represented as a pair of integers:: + import pyarrow as pa import pyarrow.types as pt class RationalType(pa.ExtensionType): @@ -143,8 +144,8 @@ be represented as a pair of integers:: super().__init__( pa.struct( [ - ("numer", pa.int64()), - ("denom", pa.int64()), + ("numer", pa.int32()), + ("denom", pa.int32()), ], ), "my_package.rational", @@ -157,7 +158,8 @@ be represented as a pair of integers:: @classmethod def __arrow_ext_deserialize__(self, storage_type, serialized): # Sanity checks, not required but illustrate the method signature. - assert pt.is_int32(storage_type) + assert pt.is_struct(storage_type) + assert pt.is_int32(storage_type[0].type) assert serialized == b'' # return an instance of this subclass given the serialized From d875d2cd3753ba05727ad3223a491c21383f23c8 Mon Sep 17 00:00:00 2001 From: Kevin Wilson Date: Tue, 27 Aug 2024 20:34:33 -0400 Subject: [PATCH 004/130] Update docs/source/python/extending_types.rst Co-authored-by: Ian Cook --- docs/source/python/extending_types.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/python/extending_types.rst b/docs/source/python/extending_types.rst index a881957ccce4a..af8e9b5cc636c 100644 --- a/docs/source/python/extending_types.rst +++ b/docs/source/python/extending_types.rst @@ -156,7 +156,7 @@ be represented as a pair of integers:: return b"" @classmethod - def __arrow_ext_deserialize__(self, storage_type, serialized): + def __arrow_ext_deserialize__(cls, storage_type, serialized): # Sanity checks, not required but illustrate the method signature. 
assert pt.is_struct(storage_type) assert pt.is_int32(storage_type[0].type) From eb690554a2d0bd968de977c8ca6360559d36a4e8 Mon Sep 17 00:00:00 2001 From: Kevin Wilson Date: Tue, 27 Aug 2024 20:35:04 -0400 Subject: [PATCH 005/130] Update docs/source/python/extending_types.rst Co-authored-by: Ian Cook --- docs/source/python/extending_types.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/python/extending_types.rst b/docs/source/python/extending_types.rst index af8e9b5cc636c..95d686bfe0d19 100644 --- a/docs/source/python/extending_types.rst +++ b/docs/source/python/extending_types.rst @@ -160,6 +160,7 @@ be represented as a pair of integers:: # Sanity checks, not required but illustrate the method signature. assert pt.is_struct(storage_type) assert pt.is_int32(storage_type[0].type) + assert pt.is_int32(storage_type[1].type) assert serialized == b'' # return an instance of this subclass given the serialized From 3fc68428545c325fa34ae52bfd9c1d430090ab9a Mon Sep 17 00:00:00 2001 From: Kevin H Wilson Date: Wed, 28 Aug 2024 01:03:00 +0000 Subject: [PATCH 006/130] response to ianmcook --- docs/source/python/extending_types.rst | 20 +++++++++----------- python/pyarrow/types.pxi | 20 ++++++++------------ 2 files changed, 17 insertions(+), 23 deletions(-) diff --git a/docs/source/python/extending_types.rst b/docs/source/python/extending_types.rst index 95d686bfe0d19..2f1968deb09c3 100644 --- a/docs/source/python/extending_types.rst +++ b/docs/source/python/extending_types.rst @@ -134,9 +134,6 @@ such as PySpark. For example, we could define a custom rational type for fractions which can be represented as a pair of integers:: - import pyarrow as pa - import pyarrow.types as pt - class RationalType(pa.ExtensionType): def __init__(self): @@ -158,9 +155,9 @@ be represented as a pair of integers:: @classmethod def __arrow_ext_deserialize__(cls, storage_type, serialized): # Sanity checks, not required but illustrate the method signature. - assert pt.is_struct(storage_type) - assert pt.is_int32(storage_type[0].type) - assert pt.is_int32(storage_type[1].type) + assert pa.types.is_struct(storage_type) + assert pa.types.is_int32(storage_type[0].type) + assert pa.types.is_int32(storage_type[1].type) assert serialized == b'' # return an instance of this subclass given the serialized @@ -180,13 +177,14 @@ This can now be used to create arrays and tables holding the extension type:: StructType(struct) >>> storage_array = pa.array( - ... [ - ... {"numer": 10, "denom": 17}, - ... {"numer": 20, "denom": 13}, - ... ], - ... type=rational_type.storage_type + ... [ + ... {"numer": 10, "denom": 17}, + ... {"numer": 20, "denom": 13}, + ... ], + ... type=rational_type.storage_type, ... ) >>> arr = rational_type.wrap_array(storage_array) + >>> # or equivalently >>> arr = pa.ExtensionArray.from_storage(rational_type, storage_array) >>> arr diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 1cb929eac6bf2..3c3c63f04c8e0 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1620,8 +1620,6 @@ cdef class ExtensionType(BaseExtensionType): -------- Define a RationalType extension type subclassing ExtensionType: - >>> import pyarrow as pa - >>> import pyarrow.types as pt >>> class RationalType(pa.ExtensionType): ... def __init__(self, data_type: pa.DataType): ... if not pt.is_integer(data_type): @@ -1641,7 +1639,7 @@ cdef class ExtensionType(BaseExtensionType): ... # No serialized metadata necessary ... return b"" ... @classmethod - ... 
def __arrow_ext_deserialize__(self, storage_type, serialized): + ... def __arrow_ext_deserialize__(cls, storage_type, serialized): ... # return an instance of this subclass given the serialized ... # metadata ... return RationalType(storage_type[0].type) @@ -1670,14 +1668,16 @@ cdef class ExtensionType(BaseExtensionType): ... ], ... type=rational_type.storage_type ... ) - >>> ratoinal_type.wrap_array(storage_array) + >>> rational_array = rational_type.wrap_array(storage_array) + >>> rational_array -- is_valid: all not null ... Or do the same with creating an ExtensionArray: - >>> pa.ExtensionArray.from_storage(rational_type, storage_array) + >>> rational_array = pa.ExtensionArray.from_storage(rational_type, storage_array) + >>> rational_array -- is_valid: all not null ... @@ -1750,7 +1750,7 @@ cdef class ExtensionType(BaseExtensionType): return NotImplementedError @classmethod - def __arrow_ext_deserialize__(self, storage_type, serialized): + def __arrow_ext_deserialize__(cls, storage_type, serialized): """ Return an extension type instance from the storage type and serialized metadata. @@ -2061,8 +2061,6 @@ def register_extension_type(ext_type): -------- Define a RationalType extension type subclassing ExtensionType: - >>> import pyarrow as pa - >>> import pyarrow.types as pt >>> class RationalType(pa.ExtensionType): ... def __init__(self, data_type: pa.DataType): ... if not pt.is_integer(data_type): @@ -2082,7 +2080,7 @@ def register_extension_type(ext_type): ... # No serialized metadata necessary ... return b"" ... @classmethod - ... def __arrow_ext_deserialize__(self, storage_type, serialized): + ... def __arrow_ext_deserialize__(cls, storage_type, serialized): ... # return an instance of this subclass given the serialized ... # metadata ... return RationalType(storage_type[0].type) @@ -2122,8 +2120,6 @@ def unregister_extension_type(type_name): -------- Define a RationalType extension type subclassing ExtensionType: - >>> import pyarrow as pa - >>> import pyarrow.types as pt >>> class RationalType(pa.ExtensionType): ... def __init__(self, data_type: pa.DataType): ... if not pt.is_integer(data_type): @@ -2143,7 +2139,7 @@ def unregister_extension_type(type_name): ... # No serialized metadata necessary ... return b"" ... @classmethod - ... def __arrow_ext_deserialize__(self, storage_type, serialized): + ... def __arrow_ext_deserialize__(cls, storage_type, serialized): ... # return an instance of this subclass given the serialized ... # metadata ... return RationalType(storage_type[0].type) From 0e051ea858ba95dca6350703ba9e08c6fb745dee Mon Sep 17 00:00:00 2001 From: Kevin H Wilson Date: Sat, 31 Aug 2024 21:40:12 +0000 Subject: [PATCH 007/130] define parameters --- docs/source/python/extending_types.rst | 123 ++++++++++++++++--------- python/pyarrow/types.pxi | 21 ++--- 2 files changed, 88 insertions(+), 56 deletions(-) diff --git a/docs/source/python/extending_types.rst b/docs/source/python/extending_types.rst index 2f1968deb09c3..a38118de29c44 100644 --- a/docs/source/python/extending_types.rst +++ b/docs/source/python/extending_types.rst @@ -116,61 +116,68 @@ a :class:`~pyarrow.Array` or a :class:`~pyarrow.ChunkedArray`. Defining extension types ("user-defined types") ----------------------------------------------- -Arrow has the notion of extension types in the metadata specification as a -possibility to extend the built-in types. 
This is done by annotating any of the -built-in Arrow data types (the "storage type") with a custom type name and -optional serialized representation ("ARROW:extension:name" and -"ARROW:extension:metadata" keys in the Field’s custom_metadata of an IPC -message). -See the :ref:`format_metadata_extension_types` section of the metadata -specification for more details. - -Pyarrow allows you to define such extension types from Python by subclassing -:class:`ExtensionType` and giving the derived class its own extension name -and serialization mechanism. The extension name and serialized metadata -can potentially be recognized by other (non-Python) Arrow implementations +Arrow affords a notion of extension types which allow users to annotate data +types with additional semantics. This allows downstream consumers both to +specify custom serialization and deserialization routines (for example, +to :ref:`Python scalars ` and +:ref:`pandas `) and to more easily interpret data. + +In the Arrow :doc:`metadata specification`, +this is accomplished by annotating any of the built-in Arrow data types +(the "storage type") with a custom type name and, optionally, a byte +array that can be used to provide additional metadata (referred to as +"parameters" in this documentation). These appear as the +``ARROW:extension:name`` and ``ARROW:extension:metadata`` keys in the +Field's ``custom_metadata``. + +Note that since these annotations are part of the Arrow specification, +they can potentially be recognized by other (non-Python) Arrow consumers such as PySpark. -For example, we could define a custom rational type for fractions which can -be represented as a pair of integers:: +Pyarrow allows you to define extension types from Python by subclassing +:class:`ExtensionType` and giving the derived class its own extension name +and mechanism to (de)serialize any parameters. For example, we could define +a custom rational type for fractions which can be represented as a pair of +integers:: class RationalType(pa.ExtensionType): - def __init__(self): + def __init__(self, data_type: pa.DataType): + if not pa.types.is_integer(data_type): + raise TypeError(f"data_type must be an integer type not {data_type}") super().__init__( pa.struct( [ - ("numer", pa.int32()), - ("denom", pa.int32()), + ("numer", data_type), + ("denom", data_type), ], ), "my_package.rational", ) def __arrow_ext_serialize__(self) -> bytes: - # No serialized metadata necessary + # No parameters are necessary return b"" @classmethod def __arrow_ext_deserialize__(cls, storage_type, serialized): # Sanity checks, not required but illustrate the method signature. assert pa.types.is_struct(storage_type) - assert pa.types.is_int32(storage_type[0].type) - assert pa.types.is_int32(storage_type[1].type) - assert serialized == b'' + assert pa.types.is_integer(storage_type[0].type) + assert storage_type[0].type == storage_type[1].type + assert serialized == b"" - # return an instance of this subclass given the serialized - # metadata - return RationalType() + # return an instance of this subclass + return RationalType(storage_type[0].type) The special methods ``__arrow_ext_serialize__`` and ``__arrow_ext_deserialize__`` -define the serialization of an extension type instance. +define the serialization and deserialization of an extension type instance. 
This can now be used to create arrays and tables holding the extension type:: - >>> rational_type = RationalType() + >>> rational_type = RationalType(pa.int32()) >>> rational_type.extension_name 'my_package.rational' >>> rational_type.storage_type @@ -205,7 +212,7 @@ another Python process. The receiving process must explicitly register the extension type for deserialization, otherwise it will fall back to the storage type:: - >>> pa.register_extension_type(RationalType()) + >>> pa.register_extension_type(RationalType(pa.int32())) For example, creating a RecordBatch and writing it to a stream using the IPC protocol:: @@ -220,19 +227,44 @@ and then reading it back yields the proper type:: >>> with pa.ipc.open_stream(buf) as reader: ... result = reader.read_all() - >>> result.column('ext').type + >>> result.column("ext").type RationalType(StructType(struct)) +Further, note that while we registered the concrete type +``RationalType(pa.int32())``, ``RationalType(integer_type)`` has the same +extension name (``"my_package.rational"``) for all integer types. As such, +the above code will also allow users to (de)serialize these data types:: + + >>> big_rational_type = RationalType(pa.int64()) + >>> storage_array = pa.array( + ... [ + ... {"numer": 10, "denom": 17}, + ... {"numer": 20, "denom": 13}, + ... ], + ... type=big_rational_type.storage_type, + ... ) + >>> arr = big_rational_type.wrap_array(storage_array) + >>> batch = pa.RecordBatch.from_arrays([arr], ["ext"]) + >>> sink = pa.BufferOutputStream() + >>> with pa.RecordBatchStreamWriter(sink, batch.schema) as writer: + ... writer.write_batch(batch) + >>> buf = sink.getvalue() + >>> with pa.ipc.open_stream(buf) as reader: + ... result = reader.read_all() + >>> result.column("ext").type + RationalType(StructType(struct)) + The receiving application doesn't need to be Python but can still recognize -the extension type as a "my_package.rational" type, if it has implemented its own +the extension type as a "my_package.rational" type if it has implemented its own extension type to receive it. If the type is not registered in the receiving application, it will fall back to the storage type. Parameterized extension type ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The above example used a fixed storage type with no further metadata. But -more flexible, parameterized extension types are also possible. +The above example illustrated how to construct an extension type that requires +no additional metadata beyond its storage type. But Arrow also provides more +flexible, parameterized extension types. The example given here implements an extension type for the `pandas "period" data type `__, @@ -248,14 +280,14 @@ of the given frequency since 1970. # attributes need to be set first before calling # super init (as that calls serialize) self._freq = freq - super().__init__(pa.int64(), 'my_package.period') + super().__init__(pa.int64(), "my_package.period") @property def freq(self): return self._freq def __arrow_ext_serialize__(self): - return "freq={}".format(self.freq).encode() + return "freq={self.freq}".encode() @classmethod def __arrow_ext_deserialize__(cls, storage_type, serialized): @@ -263,7 +295,7 @@ of the given frequency since 1970. # metadata. 
serialized = serialized.decode() assert serialized.startswith("freq=") - freq = serialized.split('=')[1] + freq = serialized.split("=")[1] return PeriodType(freq) Here, we ensure to store all information in the serialized metadata that is @@ -297,7 +329,7 @@ the data as a 2-D Numpy array ``(N, 3)`` without any copy:: super().__init__(pa.list_(pa.float32(), 3), "my_package.Point3DType") def __arrow_ext_serialize__(self): - return b'' + return b"" @classmethod def __arrow_ext_deserialize__(cls, storage_type, serialized): @@ -336,6 +368,8 @@ This array can be sent over IPC, received in another Python process, and the cus extension array class will be preserved (as long as the receiving process registers the extension type using :func:`register_extension_type` before reading the IPC data). +.. _custom-scalar-conversion: + Custom scalar conversion ~~~~~~~~~~~~~~~~~~~~~~~~ @@ -358,7 +392,7 @@ For example, if we wanted the above example 3D point type to return a custom super().__init__(pa.list_(pa.float32(), 3), "my_package.Point3DType") def __arrow_ext_serialize__(self): - return b'' + return b"" @classmethod def __arrow_ext_deserialize__(cls, storage_type, serialized): @@ -377,6 +411,7 @@ Arrays built using this extension type now provide scalars that convert to our ` >>> arr.to_pylist() [Point3D(x=1.0, y=2.0, z=3.0), Point3D(x=4.0, y=5.0, z=6.0)] +.. _conversion-to-pandas: Conversion to pandas ~~~~~~~~~~~~~~~~~~~~ @@ -459,16 +494,16 @@ Extension arrays can be used as columns in ``pyarrow.Table`` or >>> data = [ ... pa.array([1, 2, 3]), - ... pa.array(['foo', 'bar', None]), + ... pa.array(["foo", "bar", None]), ... pa.array([True, None, True]), ... tensor_array, ... tensor_array_2 ... ] - >>> my_schema = pa.schema([('f0', pa.int8()), - ... ('f1', pa.string()), - ... ('f2', pa.bool_()), - ... ('tensors_int', tensor_type), - ... ('tensors_float', tensor_type_2)]) + >>> my_schema = pa.schema([("f0", pa.int8()), + ... ("f1", pa.string()), + ... ("f2", pa.bool_()), + ... ("tensors_int", tensor_type), + ... ("tensors_float", tensor_type_2)]) >>> table = pa.Table.from_arrays(data, schema=my_schema) >>> table pyarrow.Table @@ -564,7 +599,7 @@ or .. code-block:: python - >>> tensor_type = pa.fixed_shape_tensor(pa.bool_(), [2, 2, 3], dim_names=['C', 'H', 'W']) + >>> tensor_type = pa.fixed_shape_tensor(pa.bool_(), [2, 2, 3], dim_names=["C", "H", "W"]) for ``NCHW`` format where: diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 3c3c63f04c8e0..70129f51eee20 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1622,7 +1622,7 @@ cdef class ExtensionType(BaseExtensionType): >>> class RationalType(pa.ExtensionType): ... def __init__(self, data_type: pa.DataType): - ... if not pt.is_integer(data_type): + ... if not pa.types.is_integer(data_type): ... raise TypeError(f"data_type must be an integer type not {data_type}") ... super().__init__( ... pa.struct( @@ -1636,12 +1636,11 @@ cdef class ExtensionType(BaseExtensionType): ... "my_package.rational", ... ) ... def __arrow_ext_serialize__(self) -> bytes: - ... # No serialized metadata necessary + ... # No parameters are necessary ... return b"" ... @classmethod ... def __arrow_ext_deserialize__(cls, storage_type, serialized): - ... # return an instance of this subclass given the serialized - ... # metadata + ... # return an instance of this subclass ... 
return RationalType(storage_type[0].type) Register the extension type: @@ -2063,7 +2062,7 @@ def register_extension_type(ext_type): >>> class RationalType(pa.ExtensionType): ... def __init__(self, data_type: pa.DataType): - ... if not pt.is_integer(data_type): + ... if not pa.types.is_integer(data_type): ... raise TypeError(f"data_type must be an integer type not {data_type}") ... super().__init__( ... pa.struct( @@ -2077,12 +2076,11 @@ def register_extension_type(ext_type): ... "my_package.rational", ... ) ... def __arrow_ext_serialize__(self) -> bytes: - ... # No serialized metadata necessary + ... # No parameters are necessary ... return b"" ... @classmethod ... def __arrow_ext_deserialize__(cls, storage_type, serialized): - ... # return an instance of this subclass given the serialized - ... # metadata + ... # return an instance of this subclass ... return RationalType(storage_type[0].type) Register the extension type: @@ -2122,7 +2120,7 @@ def unregister_extension_type(type_name): >>> class RationalType(pa.ExtensionType): ... def __init__(self, data_type: pa.DataType): - ... if not pt.is_integer(data_type): + ... if not pa.types.is_integer(data_type): ... raise TypeError(f"data_type must be an integer type not {data_type}") ... super().__init__( ... pa.struct( @@ -2136,12 +2134,11 @@ def unregister_extension_type(type_name): ... "my_package.rational", ... ) ... def __arrow_ext_serialize__(self) -> bytes: - ... # No serialized metadata necessary + ... # No parameters are necessary ... return b"" ... @classmethod ... def __arrow_ext_deserialize__(cls, storage_type, serialized): - ... # return an instance of this subclass given the serialized - ... # metadata + ... # return an instance of this subclass ... return RationalType(storage_type[0].type) Register the extension type: From 8614fd5d793056fd74a43e86f586eb6c0f808223 Mon Sep 17 00:00:00 2001 From: Kevin Wilson Date: Sun, 8 Sep 2024 12:00:49 -0400 Subject: [PATCH 008/130] Update python/pyarrow/types.pxi Co-authored-by: Ian Cook --- python/pyarrow/types.pxi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 70129f51eee20..33dc638124b8e 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1686,8 +1686,8 @@ cdef class ExtensionType(BaseExtensionType): >>> pa.unregister_extension_type("my_package.rational") Note that even though we registered the concrete type - ``RationalType(pa.int64())``, pyarrow will be able to deserialize - ``RationalType(integer_type)`` for any ``integer_type`` as the deserializer + ``RationalType(pa.int64())``, PyArrow will be able to deserialize + ``RationalType(integer_type)`` for any ``integer_type``, as the deserializer will reference the name ``my_package.rational`` and the ``@classmethod`` ``__arrow_ext_deserialize__``. 
""" From 03b3fd992ce9208d95e507ce50f2f7ecfc788902 Mon Sep 17 00:00:00 2001 From: Kevin Wilson Date: Sun, 8 Sep 2024 12:01:13 -0400 Subject: [PATCH 009/130] Update docs/source/python/extending_types.rst Co-authored-by: Ian Cook --- docs/source/python/extending_types.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/python/extending_types.rst b/docs/source/python/extending_types.rst index a38118de29c44..328840c43dde4 100644 --- a/docs/source/python/extending_types.rst +++ b/docs/source/python/extending_types.rst @@ -134,7 +134,7 @@ Note that since these annotations are part of the Arrow specification, they can potentially be recognized by other (non-Python) Arrow consumers such as PySpark. -Pyarrow allows you to define extension types from Python by subclassing +PyArrow allows you to define extension types from Python by subclassing :class:`ExtensionType` and giving the derived class its own extension name and mechanism to (de)serialize any parameters. For example, we could define a custom rational type for fractions which can be represented as a pair of From 2abdcb82391c00f32e57dcd73dcf8e600d411178 Mon Sep 17 00:00:00 2001 From: Kevin H Wilson Date: Sun, 8 Sep 2024 16:21:47 +0000 Subject: [PATCH 010/130] more edits --- docs/source/format/Columnar.rst | 12 ++++++------ docs/source/python/extending_types.rst | 15 ++++++++------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst index c5f822f41643f..da20830ff743e 100644 --- a/docs/source/format/Columnar.rst +++ b/docs/source/format/Columnar.rst @@ -1596,12 +1596,12 @@ structure. These extension keys are: they should not be used for third-party extension types. This extension metadata can annotate any of the built-in Arrow logical -types. The intent is that an implementation that does not support an -extension type can still handle the underlying data. For example a -16-byte UUID value could be embedded in ``FixedSizeBinary(16)``, and -implementations that do not have this extension type can still work -with the underlying binary values and pass along the -``custom_metadata`` in subsequent Arrow protocol messages. +types. For example, Arrow specifies a canonical extension type that +represents a UUID as a FixedSizeBinary(16). Arrow implementations are +not required to support canonical extensions, so an implementation that +does not support this UUID type will simply interpret it as a +``FixedSizeBinary(16)`` and pass along the ``custom_metadata`` in +subsequent Arrow protocol messages. Extension types may or may not use the ``'ARROW:extension:metadata'`` field. Let's consider some example diff --git a/docs/source/python/extending_types.rst b/docs/source/python/extending_types.rst index 328840c43dde4..c14a2e65fa2c6 100644 --- a/docs/source/python/extending_types.rst +++ b/docs/source/python/extending_types.rst @@ -117,13 +117,13 @@ Defining extension types ("user-defined types") ----------------------------------------------- Arrow affords a notion of extension types which allow users to annotate data -types with additional semantics. This allows downstream consumers both to +types with additional semantics. This allows developers both to specify custom serialization and deserialization routines (for example, to :ref:`Python scalars ` and :ref:`pandas `) and to more easily interpret data. 
-In the Arrow :doc:`metadata specification`, -this is accomplished by annotating any of the built-in Arrow data types +In Arrow, :ref:`extension types ` +are specified by annotating any of the built-in Arrow data types (the "storage type") with a custom type name and, optionally, a byte array that can be used to provide additional metadata (referred to as "parameters" in this documentation). These appear as the @@ -231,9 +231,10 @@ and then reading it back yields the proper type:: RationalType(StructType(struct)) Further, note that while we registered the concrete type -``RationalType(pa.int32())``, ``RationalType(integer_type)`` has the same -extension name (``"my_package.rational"``) for all integer types. As such, -the above code will also allow users to (de)serialize these data types:: +``RationalType(pa.int32())``, the same extension name +(``"my_package.rational"``) is used by ``RationalType(integer_type)`` +for *all* Arrow integer types. As such, the above code also allows users to +(de)serialize these data types:: >>> big_rational_type = RationalType(pa.int64()) >>> storage_array = pa.array( @@ -287,7 +288,7 @@ of the given frequency since 1970. return self._freq def __arrow_ext_serialize__(self): - return "freq={self.freq}".encode() + return "freq={}".format(self.freq).encode() @classmethod def __arrow_ext_deserialize__(cls, storage_type, serialized): From 4c66e4d71627acad27c93f5fea188f3764d8a434 Mon Sep 17 00:00:00 2001 From: Kevin H Wilson Date: Sun, 8 Sep 2024 16:27:56 +0000 Subject: [PATCH 011/130] missed one formatting --- docs/source/format/Columnar.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst index da20830ff743e..4c758c5294325 100644 --- a/docs/source/format/Columnar.rst +++ b/docs/source/format/Columnar.rst @@ -1597,7 +1597,7 @@ structure. These extension keys are: This extension metadata can annotate any of the built-in Arrow logical types. For example, Arrow specifies a canonical extension type that -represents a UUID as a FixedSizeBinary(16). Arrow implementations are +represents a UUID as a ``FixedSizeBinary(16)``. Arrow implementations are not required to support canonical extensions, so an implementation that does not support this UUID type will simply interpret it as a ``FixedSizeBinary(16)`` and pass along the ``custom_metadata`` in From 253156baac193a9f3b791fe4459da624538606b5 Mon Sep 17 00:00:00 2001 From: Kevin H Wilson Date: Sun, 8 Sep 2024 16:30:59 +0000 Subject: [PATCH 012/130] import pyarrow in doctests examples --- python/pyarrow/types.pxi | 1 + 1 file changed, 1 insertion(+) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 33dc638124b8e..604867d47349c 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1620,6 +1620,7 @@ cdef class ExtensionType(BaseExtensionType): -------- Define a RationalType extension type subclassing ExtensionType: + >>> import pyarrow as pa >>> class RationalType(pa.ExtensionType): ... def __init__(self, data_type: pa.DataType): ... 
if not pa.types.is_integer(data_type): From da6cdf29dbe5acae3614c54160fa99390373066f Mon Sep 17 00:00:00 2001 From: Kevin H Wilson Date: Sun, 8 Sep 2024 18:49:11 +0000 Subject: [PATCH 013/130] import pyarrow in more doctests examples --- python/pyarrow/types.pxi | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 604867d47349c..64a984731c1c2 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -2061,6 +2061,7 @@ def register_extension_type(ext_type): -------- Define a RationalType extension type subclassing ExtensionType: + >>> import pyarrow as pa >>> class RationalType(pa.ExtensionType): ... def __init__(self, data_type: pa.DataType): ... if not pa.types.is_integer(data_type): @@ -2119,6 +2120,7 @@ def unregister_extension_type(type_name): -------- Define a RationalType extension type subclassing ExtensionType: + >>> import pyarrow as pa >>> class RationalType(pa.ExtensionType): ... def __init__(self, data_type: pa.DataType): ... if not pa.types.is_integer(data_type): From d31d59b45b7353982f7846bd16a05522ec2b90af Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 26 Aug 2024 12:38:38 +0900 Subject: [PATCH 014/130] MINOR: [Java] Bump dep.slf4j.version from 2.0.13 to 2.0.16 in /java (#43652) Bumps `dep.slf4j.version` from 2.0.13 to 2.0.16. Updates `org.slf4j:slf4j-api` from 2.0.13 to 2.0.16 Updates `org.slf4j:slf4j-jdk14` from 2.0.13 to 2.0.16 Updates `org.slf4j:jul-to-slf4j` from 2.0.13 to 2.0.16 Updates `org.slf4j:jcl-over-slf4j` from 2.0.13 to 2.0.16 Updates `org.slf4j:log4j-over-slf4j` from 2.0.13 to 2.0.16 Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: David Li --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/pom.xml b/java/pom.xml index a73453df68fd2..54bb7a0ae0eb9 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -94,7 +94,7 @@ under the License. ${project.build.directory}/generated-sources 1.9.0 5.10.3 - 2.0.13 + 2.0.16 33.2.1-jre 4.1.112.Final 1.66.0 From 8855c59b82d006f9e02513d9c83872af37a7ecde Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Sun, 25 Aug 2024 21:33:51 -0700 Subject: [PATCH 015/130] MINOR: [R] Add missing PR num to news.md item (#43811) ### Rationale for this change We normally link to somewhere to give the user more context on news items. I noticed the link was missing for this one. ### What changes are included in this PR? Added PR number to news item. ### Are these changes tested? No. ### Are there any user-facing changes? No. Authored-by: Bryce Mecum Signed-off-by: Jacob Wujciak-Jens --- r/NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/NEWS.md b/r/NEWS.md index 0e6e4634a0af8..b9568afe66542 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -32,7 +32,7 @@ functions (UDFs); for UDFs, see `register_scalar_function()`. (#41223) * `mutate()` expressions can now include aggregations, such as `x - mean(x)`. (#41350) * `summarize()` supports more complex expressions, and correctly handles cases - where column names are reused in expressions. + where column names are reused in expressions. (#41223) * The `na_matches` argument to the `dplyr::*_join()` functions is now supported. This argument controls whether `NA` values are considered equal when joining. (#41358) * R metadata, stored in the Arrow schema to support round-tripping data between From 7bc2e018d551abdfd18f03f8f3b6a2de7da7e576 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 26 Aug 2024 16:20:20 +0900 Subject: [PATCH 016/130] MINOR: [Java] Bump dep.junit.jupiter.version from 5.10.3 to 5.11.0 in /java (#43751) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps `dep.junit.jupiter.version` from 5.10.3 to 5.11.0. Updates `org.junit.jupiter:junit-jupiter-engine` from 5.10.3 to 5.11.0
Updates `org.junit.jupiter:junit-jupiter-api` from 5.10.3 to 5.11.0
Updates `org.junit.jupiter:junit-jupiter-params` from 5.10.3 to 5.11.0
Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: David Li --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/pom.xml b/java/pom.xml index 54bb7a0ae0eb9..77feed12f3f1d 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -93,7 +93,7 @@ under the License. ${project.build.directory}/generated-sources 1.9.0 - 5.10.3 + 5.11.0 2.0.16 33.2.1-jre 4.1.112.Final From 20f8357c157292d50bbe0ffa15200a56d7eb972c Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 26 Aug 2024 16:34:18 +0200 Subject: [PATCH 017/130] GH-15058: [C++][Python] Native support for UUID (#37298) ### Rationale for this change See #15058. UUID datatype is common in throughout the ecosystem and Arrow as supporting it as a native type would reduce friction. ### What changes are included in this PR? This PR implements logic for Arrow canonical extension type in C++ and a Python wrapper. ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes, new extension type is added. * Closes: #15058 Authored-by: Rok Mihevc Signed-off-by: Antoine Pitrou --- cpp/src/arrow/CMakeLists.txt | 3 +- cpp/src/arrow/acero/hash_join_node_test.cc | 1 + cpp/src/arrow/extension/CMakeLists.txt | 2 +- .../extension/fixed_shape_tensor_test.cc | 17 +-- cpp/src/arrow/extension/uuid.cc | 58 ++++++++++ cpp/src/arrow/extension/uuid.h | 61 ++++++++++ cpp/src/arrow/extension/uuid_test.cc | 72 ++++++++++++ cpp/src/arrow/extension_type.cc | 4 +- cpp/src/arrow/extension_type_test.cc | 19 +--- .../integration/json_integration_test.cc | 2 +- cpp/src/arrow/ipc/test_common.cc | 35 ++++-- cpp/src/arrow/ipc/test_common.h | 3 + cpp/src/arrow/scalar_test.cc | 5 +- cpp/src/arrow/testing/extension_type.h | 6 +- cpp/src/arrow/testing/gtest_util.cc | 16 ++- dev/archery/archery/integration/datagen.py | 2 +- docs/source/format/CanonicalExtensions.rst | 2 + docs/source/status.rst | 2 +- python/pyarrow/__init__.py | 18 +-- python/pyarrow/array.pxi | 6 + python/pyarrow/includes/libarrow.pxd | 10 ++ python/pyarrow/lib.pxd | 3 + python/pyarrow/public-api.pxi | 11 +- python/pyarrow/scalar.pxi | 10 ++ python/pyarrow/src/arrow/python/gdb.cc | 27 +---- python/pyarrow/tests/extensions.pyx | 2 +- python/pyarrow/tests/test_extension_type.py | 105 ++++++++++++------ python/pyarrow/tests/test_gdb.py | 8 +- python/pyarrow/types.pxi | 34 ++++++ 29 files changed, 412 insertions(+), 132 deletions(-) create mode 100644 cpp/src/arrow/extension/uuid.cc create mode 100644 cpp/src/arrow/extension/uuid.h create mode 100644 cpp/src/arrow/extension/uuid_test.cc diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 89f28ee416ede..6b0ac8c23c75a 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -375,6 +375,7 @@ set(ARROW_SRCS device.cc extension_type.cc extension/bool8.cc + extension/uuid.cc pretty_print.cc record_batch.cc result.cc @@ -1225,6 +1226,7 @@ add_subdirectory(testing) add_subdirectory(array) add_subdirectory(c) add_subdirectory(compute) +add_subdirectory(extension) add_subdirectory(io) add_subdirectory(tensor) add_subdirectory(util) @@ -1267,7 +1269,6 @@ endif() if(ARROW_JSON) add_subdirectory(json) - add_subdirectory(extension) endif() if(ARROW_ORC) diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index 9065e286a2228..76ad9c7d650eb 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -29,6 +29,7 @@ #include "arrow/compute/kernels/test_util.h" 
#include "arrow/compute/light_array_internal.h" #include "arrow/compute/row/row_encoder_internal.h" +#include "arrow/extension/uuid.h" #include "arrow/testing/extension_type.h" #include "arrow/testing/generator.h" #include "arrow/testing/gtest_util.h" diff --git a/cpp/src/arrow/extension/CMakeLists.txt b/cpp/src/arrow/extension/CMakeLists.txt index 5cb4bc77af2a4..065ea3f1ddb16 100644 --- a/cpp/src/arrow/extension/CMakeLists.txt +++ b/cpp/src/arrow/extension/CMakeLists.txt @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -set(CANONICAL_EXTENSION_TESTS bool8_test.cc) +set(CANONICAL_EXTENSION_TESTS bool8_test.cc uuid_test.cc) if(ARROW_JSON) list(APPEND CANONICAL_EXTENSION_TESTS fixed_shape_tensor_test.cc opaque_test.cc) diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc index 3fd39a11ff50d..842a78e1a4f7a 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc @@ -23,7 +23,7 @@ #include "arrow/array/array_primitive.h" #include "arrow/io/memory.h" #include "arrow/ipc/reader.h" -#include "arrow/ipc/writer.h" +#include "arrow/ipc/test_common.h" #include "arrow/record_batch.h" #include "arrow/tensor.h" #include "arrow/testing/gtest_util.h" @@ -33,6 +33,7 @@ namespace arrow { using FixedShapeTensorType = extension::FixedShapeTensorType; +using arrow::ipc::test::RoundtripBatch; using extension::fixed_shape_tensor; using extension::FixedShapeTensorArray; @@ -71,20 +72,6 @@ class TestExtensionType : public ::testing::Test { std::string serialized_; }; -auto RoundtripBatch = [](const std::shared_ptr& batch, - std::shared_ptr* out) { - ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create()); - ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(), - out_stream.get())); - - ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish()); - - io::BufferReader reader(complete_ipc_stream); - std::shared_ptr batch_reader; - ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader)); - ASSERT_OK(batch_reader->ReadNext(out)); -}; - TEST_F(TestExtensionType, CheckDummyRegistration) { // We need a registered dummy type at runtime to allow for IPC deserialization auto registered_type = GetExtensionType("arrow.fixed_shape_tensor"); diff --git a/cpp/src/arrow/extension/uuid.cc b/cpp/src/arrow/extension/uuid.cc new file mode 100644 index 0000000000000..43b917a17f8b2 --- /dev/null +++ b/cpp/src/arrow/extension/uuid.cc @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include "arrow/extension_type.h" +#include "arrow/util/logging.h" + +#include "arrow/extension/uuid.h" + +namespace arrow::extension { + +bool UuidType::ExtensionEquals(const ExtensionType& other) const { + return (other.extension_name() == this->extension_name()); +} + +std::shared_ptr UuidType::MakeArray(std::shared_ptr data) const { + DCHECK_EQ(data->type->id(), Type::EXTENSION); + DCHECK_EQ("arrow.uuid", + static_cast(*data->type).extension_name()); + return std::make_shared(data); +} + +Result> UuidType::Deserialize( + std::shared_ptr storage_type, const std::string& serialized) const { + if (!serialized.empty()) { + return Status::Invalid("Unexpected serialized metadata: '", serialized, "'"); + } + if (!storage_type->Equals(*fixed_size_binary(16))) { + return Status::Invalid("Invalid storage type for UuidType: ", + storage_type->ToString()); + } + return std::make_shared(); +} + +std::string UuidType::ToString(bool show_metadata) const { + std::stringstream ss; + ss << "extension<" << this->extension_name() << ">"; + return ss.str(); +} + +std::shared_ptr uuid() { return std::make_shared(); } + +} // namespace arrow::extension diff --git a/cpp/src/arrow/extension/uuid.h b/cpp/src/arrow/extension/uuid.h new file mode 100644 index 0000000000000..42bb21cf0b2ed --- /dev/null +++ b/cpp/src/arrow/extension/uuid.h @@ -0,0 +1,61 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/extension_type.h" + +namespace arrow::extension { + +/// \brief UuidArray stores array of UUIDs. Underlying storage type is +/// FixedSizeBinary(16). +class ARROW_EXPORT UuidArray : public ExtensionArray { + public: + using ExtensionArray::ExtensionArray; +}; + +/// \brief UuidType is a canonical arrow extension type for UUIDs. +/// UUIDs are stored as FixedSizeBinary(16) with big-endian notation and this +/// does not interpret the bytes in any way. Specific UUID version is not +/// required or guaranteed. +class ARROW_EXPORT UuidType : public ExtensionType { + public: + /// \brief Construct a UuidType. + UuidType() : ExtensionType(fixed_size_binary(16)) {} + + std::string extension_name() const override { return "arrow.uuid"; } + std::string ToString(bool show_metadata = false) const override; + + bool ExtensionEquals(const ExtensionType& other) const override; + + /// Create a UuidArray from ArrayData + std::shared_ptr MakeArray(std::shared_ptr data) const override; + + Result> Deserialize( + std::shared_ptr storage_type, + const std::string& serialized) const override; + + std::string Serialize() const override { return ""; } + + /// \brief Create a UuidType instance + static Result> Make() { return std::make_shared(); } +}; + +/// \brief Return a UuidType instance. 
+ARROW_EXPORT std::shared_ptr uuid(); + +} // namespace arrow::extension diff --git a/cpp/src/arrow/extension/uuid_test.cc b/cpp/src/arrow/extension/uuid_test.cc new file mode 100644 index 0000000000000..3bbb6eeb4aef1 --- /dev/null +++ b/cpp/src/arrow/extension/uuid_test.cc @@ -0,0 +1,72 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/extension/uuid.h" + +#include "arrow/testing/matchers.h" + +#include "arrow/io/memory.h" +#include "arrow/ipc/reader.h" +#include "arrow/ipc/test_common.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/util/key_value_metadata.h" + +#include "arrow/testing/extension_type.h" + +namespace arrow { + +using arrow::ipc::test::RoundtripBatch; + +TEST(TestUuuidExtensionType, ExtensionTypeTest) { + auto type = uuid(); + ASSERT_EQ(type->id(), Type::EXTENSION); + + const auto& ext_type = static_cast(*type); + std::string serialized = ext_type.Serialize(); + + ASSERT_OK_AND_ASSIGN(auto deserialized, + ext_type.Deserialize(fixed_size_binary(16), serialized)); + ASSERT_TRUE(deserialized->Equals(*type)); + ASSERT_FALSE(deserialized->Equals(*fixed_size_binary(16))); +} + +TEST(TestUuuidExtensionType, RoundtripBatch) { + auto ext_type = extension::uuid(); + auto exact_ext_type = internal::checked_pointer_cast(ext_type); + auto arr = ArrayFromJSON(fixed_size_binary(16), R"(["abcdefghijklmnop", null])"); + auto ext_arr = ExtensionType::WrapArray(ext_type, arr); + + // Pass extension array, expect getting back extension array + std::shared_ptr read_batch; + auto ext_field = field(/*name=*/"f0", /*type=*/ext_type); + auto batch = RecordBatch::Make(schema({ext_field}), ext_arr->length(), {ext_arr}); + RoundtripBatch(batch, &read_batch); + CompareBatch(*batch, *read_batch, /*compare_metadata=*/true); + + // Pass extension metadata and storage array, expect getting back extension array + std::shared_ptr read_batch2; + auto ext_metadata = + key_value_metadata({{"ARROW:extension:name", exact_ext_type->extension_name()}, + {"ARROW:extension:metadata", ""}}); + ext_field = field(/*name=*/"f0", /*type=*/exact_ext_type->storage_type(), + /*nullable=*/true, /*metadata=*/ext_metadata); + auto batch2 = RecordBatch::Make(schema({ext_field}), arr->length(), {arr}); + RoundtripBatch(batch2, &read_batch2); + CompareBatch(*batch, *read_batch2, /*compare_metadata=*/true); +} + +} // namespace arrow diff --git a/cpp/src/arrow/extension_type.cc b/cpp/src/arrow/extension_type.cc index 83c7ebed4f319..fc220f73a6beb 100644 --- a/cpp/src/arrow/extension_type.cc +++ b/cpp/src/arrow/extension_type.cc @@ -32,6 +32,7 @@ #include "arrow/extension/fixed_shape_tensor.h" #include "arrow/extension/opaque.h" #endif +#include "arrow/extension/uuid.h" #include "arrow/status.h" #include "arrow/type.h" #include "arrow/util/checked_cast.h" 
@@ -147,14 +148,13 @@ static void CreateGlobalRegistry() { // Register canonical extension types g_registry = std::make_shared(); - std::vector> ext_types{extension::bool8()}; + std::vector> ext_types{extension::bool8(), extension::uuid()}; #ifdef ARROW_JSON ext_types.push_back(extension::fixed_shape_tensor(int64(), {})); ext_types.push_back(extension::opaque(null(), "", "")); #endif - // Register canonical extension types for (const auto& ext_type : ext_types) { ARROW_CHECK_OK( g_registry->RegisterType(checked_pointer_cast(ext_type))); diff --git a/cpp/src/arrow/extension_type_test.cc b/cpp/src/arrow/extension_type_test.cc index f104c984a64b4..f49ffc5cba553 100644 --- a/cpp/src/arrow/extension_type_test.cc +++ b/cpp/src/arrow/extension_type_test.cc @@ -30,6 +30,7 @@ #include "arrow/io/memory.h" #include "arrow/ipc/options.h" #include "arrow/ipc/reader.h" +#include "arrow/ipc/test_common.h" #include "arrow/ipc/writer.h" #include "arrow/record_batch.h" #include "arrow/status.h" @@ -41,6 +42,8 @@ namespace arrow { +using arrow::ipc::test::RoundtripBatch; + class Parametric1Array : public ExtensionArray { public: using ExtensionArray::ExtensionArray; @@ -178,7 +181,7 @@ class ExtStructType : public ExtensionType { class TestExtensionType : public ::testing::Test { public: - void SetUp() { ASSERT_OK(RegisterExtensionType(std::make_shared())); } + void SetUp() { ASSERT_OK(RegisterExtensionType(std::make_shared())); } void TearDown() { if (GetExtensionType("uuid")) { @@ -211,20 +214,6 @@ TEST_F(TestExtensionType, ExtensionTypeTest) { ASSERT_EQ(deserialized->byte_width(), 16); } -auto RoundtripBatch = [](const std::shared_ptr& batch, - std::shared_ptr* out) { - ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create()); - ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(), - out_stream.get())); - - ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish()); - - io::BufferReader reader(complete_ipc_stream); - std::shared_ptr batch_reader; - ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader)); - ASSERT_OK(batch_reader->ReadNext(out)); -}; - TEST_F(TestExtensionType, IpcRoundtrip) { auto ext_arr = ExampleUuid(); auto batch = RecordBatch::Make(schema({field("f0", uuid())}), 4, {ext_arr}); diff --git a/cpp/src/arrow/integration/json_integration_test.cc b/cpp/src/arrow/integration/json_integration_test.cc index 9b56928c68843..0e84ea6124d5d 100644 --- a/cpp/src/arrow/integration/json_integration_test.cc +++ b/cpp/src/arrow/integration/json_integration_test.cc @@ -1046,7 +1046,7 @@ TEST(TestJsonFileReadWrite, JsonExample2) { auto storage_array = ArrayFromJSON(fixed_size_binary(16), R"(["0123456789abcdef", null])"); - AssertArraysEqual(*batch->column(0), UuidArray(uuid_type, storage_array)); + AssertArraysEqual(*batch->column(0), ExampleUuidArray(uuid_type, storage_array)); AssertArraysEqual(*batch->column(1), NullArray(2)); } diff --git a/cpp/src/arrow/ipc/test_common.cc b/cpp/src/arrow/ipc/test_common.cc index 87c02e2d87a1e..fb4f6bd8eadcf 100644 --- a/cpp/src/arrow/ipc/test_common.cc +++ b/cpp/src/arrow/ipc/test_common.cc @@ -27,8 +27,10 @@ #include "arrow/array.h" #include "arrow/array/builder_binary.h" #include "arrow/array/builder_primitive.h" -#include "arrow/array/builder_time.h" +#include "arrow/io/memory.h" +#include "arrow/ipc/reader.h" #include "arrow/ipc/test_common.h" +#include "arrow/ipc/writer.h" #include "arrow/pretty_print.h" #include "arrow/record_batch.h" #include "arrow/status.h" @@ -242,11 +244,11 @@ 
Status MakeRandomBooleanArray(const int length, bool include_nulls, std::shared_ptr* out) { std::vector values(length); random_null_bytes(length, 0.5, values.data()); - ARROW_ASSIGN_OR_RAISE(auto data, internal::BytesToBits(values)); + ARROW_ASSIGN_OR_RAISE(auto data, arrow::internal::BytesToBits(values)); if (include_nulls) { std::vector valid_bytes(length); - ARROW_ASSIGN_OR_RAISE(auto null_bitmap, internal::BytesToBits(valid_bytes)); + ARROW_ASSIGN_OR_RAISE(auto null_bitmap, arrow::internal::BytesToBits(valid_bytes)); random_null_bytes(length, 0.1, valid_bytes.data()); *out = std::make_shared(length, data, null_bitmap, -1); } else { @@ -596,7 +598,7 @@ Status MakeStruct(std::shared_ptr* out) { std::shared_ptr no_nulls(new StructArray(type, list_batch->num_rows(), columns)); std::vector null_bytes(list_batch->num_rows(), 1); null_bytes[0] = 0; - ARROW_ASSIGN_OR_RAISE(auto null_bitmap, internal::BytesToBits(null_bytes)); + ARROW_ASSIGN_OR_RAISE(auto null_bitmap, arrow::internal::BytesToBits(null_bytes)); std::shared_ptr with_nulls( new StructArray(type, list_batch->num_rows(), columns, null_bitmap, 1)); @@ -1088,9 +1090,9 @@ Status MakeUuid(std::shared_ptr* out) { auto f1 = field("f1", uuid_type, /*nullable=*/false); auto schema = ::arrow::schema({f0, f1}); - auto a0 = std::make_shared( + auto a0 = std::make_shared( uuid_type, ArrayFromJSON(storage_type, R"(["0123456789abcdef", null])")); - auto a1 = std::make_shared( + auto a1 = std::make_shared( uuid_type, ArrayFromJSON(storage_type, R"(["ZYXWVUTSRQPONMLK", "JIHGFEDBA9876543"])")); @@ -1176,12 +1178,13 @@ enable_if_t::value, void> FillRandomData( Status MakeRandomTensor(const std::shared_ptr& type, const std::vector& shape, bool row_major_p, std::shared_ptr* out, uint32_t seed) { - const auto& element_type = internal::checked_cast(*type); + const auto& element_type = arrow::internal::checked_cast(*type); std::vector strides; if (row_major_p) { - RETURN_NOT_OK(internal::ComputeRowMajorStrides(element_type, shape, &strides)); + RETURN_NOT_OK(arrow::internal::ComputeRowMajorStrides(element_type, shape, &strides)); } else { - RETURN_NOT_OK(internal::ComputeColumnMajorStrides(element_type, shape, &strides)); + RETURN_NOT_OK( + arrow::internal::ComputeColumnMajorStrides(element_type, shape, &strides)); } const int64_t element_size = element_type.bit_width() / CHAR_BIT; @@ -1233,6 +1236,20 @@ Status MakeRandomTensor(const std::shared_ptr& type, return Tensor::Make(type, buf, shape, strides).Value(out); } +void RoundtripBatch(const std::shared_ptr& batch, + std::shared_ptr* out) { + ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create()); + ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(), + out_stream.get())); + + ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish()); + + io::BufferReader reader(complete_ipc_stream); + std::shared_ptr batch_reader; + ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader)); + ASSERT_OK(batch_reader->ReadNext(out)); +} + } // namespace test } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/ipc/test_common.h b/cpp/src/arrow/ipc/test_common.h index db8613cbb1e6a..9b7e7f13e3a8e 100644 --- a/cpp/src/arrow/ipc/test_common.h +++ b/cpp/src/arrow/ipc/test_common.h @@ -184,6 +184,9 @@ Status MakeRandomTensor(const std::shared_ptr& type, const std::vector& shape, bool row_major_p, std::shared_ptr* out, uint32_t seed = 0); +ARROW_TESTING_EXPORT void RoundtripBatch(const std::shared_ptr& batch, + std::shared_ptr* out); + } 
// namespace test } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/scalar_test.cc b/cpp/src/arrow/scalar_test.cc index 104a5697b5727..e9ec13e98b4ee 100644 --- a/cpp/src/arrow/scalar_test.cc +++ b/cpp/src/arrow/scalar_test.cc @@ -43,7 +43,6 @@ namespace arrow { using compute::Cast; using compute::CastOptions; - using internal::checked_cast; using internal::checked_pointer_cast; @@ -2038,7 +2037,7 @@ class TestExtensionScalar : public ::testing::Test { void SetUp() { type_ = uuid(); storage_type_ = fixed_size_binary(16); - uuid_type_ = checked_cast(type_.get()); + uuid_type_ = checked_cast(type_.get()); } protected: @@ -2049,7 +2048,7 @@ class TestExtensionScalar : public ::testing::Test { } std::shared_ptr type_, storage_type_; - const UuidType* uuid_type_{nullptr}; + const ExampleUuidType* uuid_type_{nullptr}; const std::string_view uuid_string1_{UUID_STRING1}; const std::string_view uuid_string2_{UUID_STRING2}; diff --git a/cpp/src/arrow/testing/extension_type.h b/cpp/src/arrow/testing/extension_type.h index 6515631f202ae..a4526e31c2b93 100644 --- a/cpp/src/arrow/testing/extension_type.h +++ b/cpp/src/arrow/testing/extension_type.h @@ -27,14 +27,14 @@ namespace arrow { -class ARROW_TESTING_EXPORT UuidArray : public ExtensionArray { +class ARROW_TESTING_EXPORT ExampleUuidArray : public ExtensionArray { public: using ExtensionArray::ExtensionArray; }; -class ARROW_TESTING_EXPORT UuidType : public ExtensionType { +class ARROW_TESTING_EXPORT ExampleUuidType : public ExtensionType { public: - UuidType() : ExtensionType(fixed_size_binary(16)) {} + ExampleUuidType() : ExtensionType(fixed_size_binary(16)) {} std::string extension_name() const override { return "uuid"; } diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc index 95de16c715f19..ae2e53b30a3ee 100644 --- a/cpp/src/arrow/testing/gtest_util.cc +++ b/cpp/src/arrow/testing/gtest_util.cc @@ -49,9 +49,13 @@ #include "arrow/buffer.h" #include "arrow/compute/api_vector.h" #include "arrow/datum.h" +#include "arrow/io/memory.h" #include "arrow/ipc/json_simple.h" +#include "arrow/ipc/reader.h" +#include "arrow/ipc/writer.h" #include "arrow/json/rapidjson_defs.h" // IWYU pragma: keep #include "arrow/pretty_print.h" +#include "arrow/record_batch.h" #include "arrow/status.h" #include "arrow/table.h" #include "arrow/tensor.h" @@ -847,17 +851,17 @@ Future<> SleepABitAsync() { /////////////////////////////////////////////////////////////////////////// // Extension types -bool UuidType::ExtensionEquals(const ExtensionType& other) const { +bool ExampleUuidType::ExtensionEquals(const ExtensionType& other) const { return (other.extension_name() == this->extension_name()); } -std::shared_ptr UuidType::MakeArray(std::shared_ptr data) const { +std::shared_ptr ExampleUuidType::MakeArray(std::shared_ptr data) const { DCHECK_EQ(data->type->id(), Type::EXTENSION); DCHECK_EQ("uuid", static_cast(*data->type).extension_name()); - return std::make_shared(data); + return std::make_shared(data); } -Result> UuidType::Deserialize( +Result> ExampleUuidType::Deserialize( std::shared_ptr storage_type, const std::string& serialized) const { if (serialized != "uuid-serialized") { return Status::Invalid("Type identifier did not match: '", serialized, "'"); @@ -866,7 +870,7 @@ Result> UuidType::Deserialize( return Status::Invalid("Invalid storage type for UuidType: ", storage_type->ToString()); } - return std::make_shared(); + return std::make_shared(); } bool SmallintType::ExtensionEquals(const ExtensionType& other) const { 
@@ -982,7 +986,7 @@ Result> Complex128Type::Deserialize( return std::make_shared(); } -std::shared_ptr uuid() { return std::make_shared(); } +std::shared_ptr uuid() { return std::make_shared(); } std::shared_ptr smallint() { return std::make_shared(); } diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index d395d26cb71d3..f63aa0d95a484 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -1845,7 +1845,7 @@ def generate_nested_dictionary_case(): def generate_extension_case(): dict0 = Dictionary(0, StringField('dictionary0'), size=5, name='DICT0') - uuid_type = ExtensionType('uuid', 'uuid-serialized', + uuid_type = ExtensionType('arrow.uuid', '', FixedSizeBinaryField('', 16)) dict_ext_type = ExtensionType( 'dict-extension', 'dict-extension-serialized', diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index 5658f949ceeaa..1106f8aaffdd3 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -272,6 +272,8 @@ JSON In the future, additional fields may be added, but they are not required to interpret the array. +.. _uuid_extension: + UUID ==== diff --git a/docs/source/status.rst b/docs/source/status.rst index 5e2c2cc19c890..b685d4bbf8add 100644 --- a/docs/source/status.rst +++ b/docs/source/status.rst @@ -121,7 +121,7 @@ Data Types +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | JSON | | | ✓ | | | | | | +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| UUID | | | ✓ | | | | | | +| UUID | ✓ | | ✓ | | | | | | +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | 8-bit Boolean | ✓ | | ✓ | | | | | | +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 807bcdc315036..d31c93119b73a 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -172,9 +172,7 @@ def print_entry(label, value): union, sparse_union, dense_union, dictionary, run_end_encoded, - fixed_shape_tensor, - opaque, - bool8, + bool8, fixed_shape_tensor, opaque, uuid, field, type_for_alias, DataType, DictionaryType, StructType, @@ -184,8 +182,9 @@ def print_entry(label, value): TimestampType, Time32Type, Time64Type, DurationType, FixedSizeBinaryType, Decimal128Type, Decimal256Type, BaseExtensionType, ExtensionType, - RunEndEncodedType, FixedShapeTensorType, OpaqueType, - Bool8Type, PyExtensionType, UnknownExtensionType, + RunEndEncodedType, Bool8Type, FixedShapeTensorType, + OpaqueType, UuidType, + PyExtensionType, UnknownExtensionType, register_extension_type, unregister_extension_type, DictionaryMemo, KeyValueMetadata, @@ -218,8 +217,9 @@ def print_entry(label, value): Time32Array, Time64Array, DurationArray, MonthDayNanoIntervalArray, Decimal128Array, Decimal256Array, StructArray, ExtensionArray, - RunEndEncodedArray, FixedShapeTensorArray, OpaqueArray, - Bool8Array, scalar, NA, _NULL as NULL, Scalar, + RunEndEncodedArray, Bool8Array, FixedShapeTensorArray, + OpaqueArray, UuidArray, + scalar, NA, _NULL as NULL, Scalar, NullScalar, BooleanScalar, Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar, UInt8Scalar, UInt16Scalar, UInt32Scalar, UInt64Scalar, @@ -235,8 +235,8 @@ def print_entry(label, value): StringScalar, LargeStringScalar, StringViewScalar, 
FixedSizeBinaryScalar, DictionaryScalar, MapScalar, StructScalar, UnionScalar, - RunEndEncodedScalar, ExtensionScalar, - FixedShapeTensorScalar, OpaqueScalar, Bool8Scalar) + RunEndEncodedScalar, Bool8Scalar, ExtensionScalar, + FixedShapeTensorScalar, OpaqueScalar, UuidScalar) # Buffers, allocation from pyarrow.lib import (DeviceAllocationType, Device, MemoryManager, diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 77d6c9c06d2de..1587de0e6b744 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -4338,6 +4338,12 @@ cdef class ExtensionArray(Array): return result +class UuidArray(ExtensionArray): + """ + Concrete class for Arrow arrays of UUID data type. + """ + + cdef class FixedShapeTensorArray(ExtensionArray): """ Concrete class for fixed shape tensor extension arrays. diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 6f510cfc0c06c..c2346750a196f 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2865,6 +2865,16 @@ cdef extern from "arrow/extension_type.h" namespace "arrow": shared_ptr[CArray] storage() +cdef extern from "arrow/extension/uuid.h" namespace "arrow::extension" nogil: + cdef cppclass CUuidType" arrow::extension::UuidType"(CExtensionType): + + @staticmethod + CResult[shared_ptr[CDataType]] Make() + + cdef cppclass CUuidArray" arrow::extension::UuidArray"(CExtensionArray): + pass + + cdef extern from "arrow/extension/fixed_shape_tensor.h" namespace "arrow::extension" nogil: cdef cppclass CFixedShapeTensorType \ " arrow::extension::FixedShapeTensorType"(CExtensionType): diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index a7c3b496a0045..5c3d981c3adc7 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -222,6 +222,9 @@ cdef class OpaqueType(BaseExtensionType): cdef: const COpaqueType* opaque_ext_type +cdef class UuidType(BaseExtensionType): + cdef: + const CUuidType* uuid_ext_type cdef class PyExtensionType(ExtensionType): pass diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index 19a26bd6c683d..d3e2ff2e99d91 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -120,14 +120,17 @@ cdef api object pyarrow_wrap_data_type( elif type.get().id() == _Type_EXTENSION: ext_type = type.get() cpy_ext_type = dynamic_cast[_CPyExtensionTypePtr](ext_type) + extension_name = ext_type.extension_name() if cpy_ext_type != nullptr: return cpy_ext_type.GetInstance() - elif ext_type.extension_name() == b"arrow.fixed_shape_tensor": + elif extension_name == b"arrow.bool8": + out = Bool8Type.__new__(Bool8Type) + elif extension_name == b"arrow.fixed_shape_tensor": out = FixedShapeTensorType.__new__(FixedShapeTensorType) - elif ext_type.extension_name() == b"arrow.opaque": + elif extension_name == b"arrow.opaque": out = OpaqueType.__new__(OpaqueType) - elif ext_type.extension_name() == b"arrow.bool8": - out = Bool8Type.__new__(Bool8Type) + elif extension_name == b"arrow.uuid": + out = UuidType.__new__(UuidType) else: out = BaseExtensionType.__new__(BaseExtensionType) else: diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 72ae2aee5f8b3..68f77832c4342 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -17,6 +17,7 @@ import collections from cython cimport binding +from uuid import UUID cdef class Scalar(_Weakrefable): @@ -1043,6 +1044,15 @@ cdef class ExtensionScalar(Scalar): return pyarrow_wrap_scalar( sp_scalar) +class UuidScalar(ExtensionScalar): 
+ """ + Concrete class for Uuid extension scalar. + """ + + def as_py(self): + return None if self.value is None else UUID(bytes=self.value.as_py()) + + cdef class FixedShapeTensorScalar(ExtensionScalar): """ Concrete class for fixed shape tensor extension scalar. diff --git a/python/pyarrow/src/arrow/python/gdb.cc b/python/pyarrow/src/arrow/python/gdb.cc index 6941769e4efe8..7c58bae3342c2 100644 --- a/python/pyarrow/src/arrow/python/gdb.cc +++ b/python/pyarrow/src/arrow/python/gdb.cc @@ -22,7 +22,7 @@ #include "arrow/array.h" #include "arrow/chunked_array.h" #include "arrow/datum.h" -#include "arrow/extension_type.h" +#include "arrow/extension/uuid.h" #include "arrow/ipc/json_simple.h" #include "arrow/python/gdb.h" #include "arrow/record_batch.h" @@ -37,6 +37,8 @@ namespace arrow { +using extension::uuid; +using extension::UuidType; using ipc::internal::json::ArrayFromJSON; using ipc::internal::json::ChunkedArrayFromJSON; using ipc::internal::json::ScalarFromJSON; @@ -56,29 +58,6 @@ class CustomStatusDetail : public StatusDetail { std::string ToString() const override { return "This is a detail"; } }; -class UuidType : public ExtensionType { - public: - UuidType() : ExtensionType(fixed_size_binary(16)) {} - - std::string extension_name() const override { return "uuid"; } - - bool ExtensionEquals(const ExtensionType& other) const override { - return (other.extension_name() == this->extension_name()); - } - - std::shared_ptr MakeArray(std::shared_ptr data) const override { - return std::make_shared(data); - } - - Result> Deserialize( - std::shared_ptr storage_type, - const std::string& serialized) const override { - return Status::NotImplemented(""); - } - - std::string Serialize() const override { return "uuid-serialized"; } -}; - std::shared_ptr SliceArrayFromJSON(const std::shared_ptr& ty, std::string_view json, int64_t offset = 0, int64_t length = -1) { diff --git a/python/pyarrow/tests/extensions.pyx b/python/pyarrow/tests/extensions.pyx index c1bf9aae1ec03..309b574dc0264 100644 --- a/python/pyarrow/tests/extensions.pyx +++ b/python/pyarrow/tests/extensions.pyx @@ -37,7 +37,7 @@ cdef extern from * namespace "arrow::py" nogil: class UuidType : public ExtensionType { public: UuidType() : ExtensionType(fixed_size_binary(16)) {} - std::string extension_name() const override { return "uuid"; } + std::string extension_name() const override { return "example-uuid"; } bool ExtensionEquals(const ExtensionType& other) const override { return other.extension_name() == this->extension_name(); diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 0d50c467e96bd..aacbd2cb6e756 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -95,18 +95,21 @@ def __arrow_ext_deserialize__(cls, storage_type, serialized): return cls() -class UuidScalarType(pa.ExtensionScalar): +class ExampleUuidScalarType(pa.ExtensionScalar): def as_py(self): return None if self.value is None else UUID(bytes=self.value.as_py()) -class UuidType(pa.ExtensionType): +class ExampleUuidType(pa.ExtensionType): def __init__(self): - super().__init__(pa.binary(16), 'pyarrow.tests.UuidType') + super().__init__(pa.binary(16), 'pyarrow.tests.ExampleUuidType') + + def __reduce__(self): + return ExampleUuidType, () def __arrow_ext_scalar_class__(self): - return UuidScalarType + return ExampleUuidScalarType def __arrow_ext_serialize__(self): return b'' @@ -116,10 +119,10 @@ def __arrow_ext_deserialize__(cls, storage_type, serialized): 
return cls() -class UuidType2(pa.ExtensionType): +class ExampleUuidType2(pa.ExtensionType): def __init__(self): - super().__init__(pa.binary(16), 'pyarrow.tests.UuidType2') + super().__init__(pa.binary(16), 'pyarrow.tests.ExampleUuidType2') def __arrow_ext_serialize__(self): return b'' @@ -250,8 +253,8 @@ def ipc_read_batch(buf): def test_ext_type_basics(): - ty = UuidType() - assert ty.extension_name == "pyarrow.tests.UuidType" + ty = ExampleUuidType() + assert ty.extension_name == "pyarrow.tests.ExampleUuidType" def test_ext_type_str(): @@ -267,16 +270,16 @@ def test_ext_type_repr(): def test_ext_type_lifetime(): - ty = UuidType() + ty = ExampleUuidType() wr = weakref.ref(ty) del ty assert wr() is None def test_ext_type_storage_type(): - ty = UuidType() + ty = ExampleUuidType() assert ty.storage_type == pa.binary(16) - assert ty.__class__ is UuidType + assert ty.__class__ is ExampleUuidType ty = ParamExtType(5) assert ty.storage_type == pa.binary(5) assert ty.__class__ is ParamExtType @@ -284,7 +287,7 @@ def test_ext_type_storage_type(): def test_ext_type_byte_width(): # Test for fixed-size binary types - ty = UuidType() + ty = pa.uuid() assert ty.byte_width == 16 ty = ParamExtType(5) assert ty.byte_width == 5 @@ -297,7 +300,7 @@ def test_ext_type_byte_width(): def test_ext_type_bit_width(): # Test for fixed-size binary types - ty = UuidType() + ty = pa.uuid() assert ty.bit_width == 128 ty = ParamExtType(5) assert ty.bit_width == 40 @@ -309,7 +312,7 @@ def test_ext_type_bit_width(): def test_ext_type_as_py(): - ty = UuidType() + ty = ExampleUuidType() expected = uuid4() scalar = pa.ExtensionScalar.from_storage(ty, expected.bytes) assert scalar.as_py() == expected @@ -342,12 +345,22 @@ def test_ext_type_as_py(): def test_uuid_type_pickle(pickle_module): for proto in range(0, pickle_module.HIGHEST_PROTOCOL + 1): - ty = UuidType() + ty = ExampleUuidType() ser = pickle_module.dumps(ty, protocol=proto) del ty ty = pickle_module.loads(ser) wr = weakref.ref(ty) - assert ty.extension_name == "pyarrow.tests.UuidType" + assert ty.extension_name == "pyarrow.tests.ExampleUuidType" + del ty + assert wr() is None + + for proto in range(0, pickle_module.HIGHEST_PROTOCOL + 1): + ty = pa.uuid() + ser = pickle_module.dumps(ty, protocol=proto) + del ty + ty = pickle_module.loads(ser) + wr = weakref.ref(ty) + assert ty.extension_name == "arrow.uuid" del ty assert wr() is None @@ -358,8 +371,8 @@ def test_ext_type_equality(): c = ParamExtType(6) assert a != b assert b == c - d = UuidType() - e = UuidType() + d = ExampleUuidType() + e = ExampleUuidType() assert a != d assert d == e @@ -403,7 +416,7 @@ def test_ext_array_equality(): storage1 = pa.array([b"0123456789abcdef"], type=pa.binary(16)) storage2 = pa.array([b"0123456789abcdef"], type=pa.binary(16)) storage3 = pa.array([], type=pa.binary(16)) - ty1 = UuidType() + ty1 = ExampleUuidType() ty2 = ParamExtType(16) a = pa.ExtensionArray.from_storage(ty1, storage1) @@ -451,9 +464,9 @@ def test_ext_scalar_from_array(): data = [b"0123456789abcdef", b"0123456789abcdef", b"zyxwvutsrqponmlk", None] storage = pa.array(data, type=pa.binary(16)) - ty1 = UuidType() + ty1 = ExampleUuidType() ty2 = ParamExtType(16) - ty3 = UuidType2() + ty3 = ExampleUuidType2() a = pa.ExtensionArray.from_storage(ty1, storage) b = pa.ExtensionArray.from_storage(ty2, storage) @@ -462,9 +475,9 @@ def test_ext_scalar_from_array(): scalars_a = list(a) assert len(scalars_a) == 4 - assert ty1.__arrow_ext_scalar_class__() == UuidScalarType - assert isinstance(a[0], UuidScalarType) - assert 
isinstance(scalars_a[0], UuidScalarType) + assert ty1.__arrow_ext_scalar_class__() == ExampleUuidScalarType + assert isinstance(a[0], ExampleUuidScalarType) + assert isinstance(scalars_a[0], ExampleUuidScalarType) for s, val in zip(scalars_a, data): assert isinstance(s, pa.ExtensionScalar) @@ -505,7 +518,7 @@ def test_ext_scalar_from_array(): def test_ext_scalar_from_storage(): - ty = UuidType() + ty = ExampleUuidType() s = pa.ExtensionScalar.from_storage(ty, None) assert isinstance(s, pa.ExtensionScalar) @@ -706,14 +719,14 @@ def test_cast_between_extension_types(): tiny_int_arr.cast(pa.int64()).cast(IntegerType()) # Between the same extension types is okay - array = pa.array([b'1' * 16, b'2' * 16], pa.binary(16)).cast(UuidType()) - out = array.cast(UuidType()) - assert out.type == UuidType() + array = pa.array([b'1' * 16, b'2' * 16], pa.binary(16)).cast(ExampleUuidType()) + out = array.cast(ExampleUuidType()) + assert out.type == ExampleUuidType() # Will still fail casting between extensions who share storage type, # can only cast between exactly the same extension types. with pytest.raises(TypeError, match='Casting from *'): - array.cast(UuidType2()) + array.cast(ExampleUuidType2()) def test_cast_to_extension_with_extension_storage(): @@ -744,10 +757,10 @@ def test_cast_nested_extension_types(data, type_factory): def test_casting_dict_array_to_extension_type(): storage = pa.array([b"0123456789abcdef"], type=pa.binary(16)) - arr = pa.ExtensionArray.from_storage(UuidType(), storage) + arr = pa.ExtensionArray.from_storage(ExampleUuidType(), storage) dict_arr = pa.DictionaryArray.from_arrays(pa.array([0, 0], pa.int32()), arr) - out = dict_arr.cast(UuidType()) + out = dict_arr.cast(ExampleUuidType()) assert isinstance(out, pa.ExtensionArray) assert out.to_pylist() == [UUID('30313233-3435-3637-3839-616263646566'), UUID('30313233-3435-3637-3839-616263646566')] @@ -1347,7 +1360,7 @@ def test_cpp_extension_in_python(tmpdir): mod = __import__('extensions') uuid_type = mod._make_uuid_type() - assert uuid_type.extension_name == "uuid" + assert uuid_type.extension_name == "example-uuid" assert uuid_type.storage_type == pa.binary(16) array = mod._make_uuid_array() @@ -1356,6 +1369,31 @@ def test_cpp_extension_in_python(tmpdir): assert array[0].as_py() == b'abcdefghijklmno0' assert array[1].as_py() == b'0onmlkjihgfedcba' + buf = ipc_write_batch(pa.RecordBatch.from_arrays([array], ["example-uuid"])) + + batch = ipc_read_batch(buf) + reconstructed_array = batch.column(0) + assert reconstructed_array.type == uuid_type + assert reconstructed_array == array + + +def test_uuid_extension(): + data = [b"0123456789abcdef", b"0123456789abcdef", + b"zyxwvutsrqponmlk", None] + + uuid_type = pa.uuid() + assert uuid_type.extension_name == "arrow.uuid" + assert uuid_type.storage_type == pa.binary(16) + assert uuid_type.__class__ is pa.UuidType + + storage = pa.array(data, pa.binary(16)) + array = pa.ExtensionArray.from_storage(uuid_type, storage) + assert array.type == uuid_type + + assert array.to_pylist() == [x if x is None else UUID(bytes=x) for x in data] + assert array[0].as_py() == UUID(bytes=data[0]) + assert array[3].as_py() is None + buf = ipc_write_batch(pa.RecordBatch.from_arrays([array], ["uuid"])) batch = ipc_read_batch(buf) @@ -1363,6 +1401,9 @@ def test_cpp_extension_in_python(tmpdir): assert reconstructed_array.type == uuid_type assert reconstructed_array == array + assert uuid_type.__arrow_ext_scalar_class__() == pa.UuidScalar + assert isinstance(array[0], pa.UuidScalar) + def test_tensor_type(): 
tensor_type = pa.fixed_shape_tensor(pa.int8(), [2, 3]) diff --git a/python/pyarrow/tests/test_gdb.py b/python/pyarrow/tests/test_gdb.py index 0d12d710dcf64..2ac2f55754fe5 100644 --- a/python/pyarrow/tests/test_gdb.py +++ b/python/pyarrow/tests/test_gdb.py @@ -409,7 +409,7 @@ def test_types_stack(gdb_arrow): check_stack_repr( gdb_arrow, "uuid_type", - ('arrow::ExtensionType "extension" ' + ('arrow::ExtensionType "extension" ' 'with storage type arrow::fixed_size_binary(16)')) @@ -447,7 +447,7 @@ def test_types_heap(gdb_arrow): check_heap_repr( gdb_arrow, "heap_uuid_type", - ('arrow::ExtensionType "extension" ' + ('arrow::ExtensionType "extension" ' 'with storage type arrow::fixed_size_binary(16)')) @@ -716,12 +716,12 @@ def test_scalars_stack(gdb_arrow): check_stack_repr( gdb_arrow, "extension_scalar", - ('arrow::ExtensionScalar of type "extension", ' + ('arrow::ExtensionScalar of type "extension", ' 'value arrow::FixedSizeBinaryScalar of size 16, ' 'value "0123456789abcdef"')) check_stack_repr( gdb_arrow, "extension_scalar_null", - 'arrow::ExtensionScalar of type "extension", null value') + 'arrow::ExtensionScalar of type "extension", null value') def test_scalars_heap(gdb_arrow): diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 64a984731c1c2..5f0561ecfcecc 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1785,6 +1785,25 @@ cdef class ExtensionType(BaseExtensionType): return ExtensionScalar +cdef class UuidType(BaseExtensionType): + """ + Concrete class for UUID extension type. + """ + + cdef void init(self, const shared_ptr[CDataType]& type) except *: + BaseExtensionType.init(self, type) + self.uuid_ext_type = type.get() + + def __arrow_ext_class__(self): + return UuidArray + + def __reduce__(self): + return uuid, () + + def __arrow_ext_scalar_class__(self): + return UuidScalar + + cdef class FixedShapeTensorType(BaseExtensionType): """ Concrete class for fixed shape tensor extension type. @@ -5246,6 +5265,21 @@ def run_end_encoded(run_end_type, value_type): return pyarrow_wrap_data_type(ree_type) +def uuid(): + """ + Create UuidType instance. + + Returns + ------- + type : UuidType + """ + + cdef UuidType out = UuidType.__new__(UuidType) + c_uuid_ext_type = GetResultValue(CUuidType.Make()) + out.init(c_uuid_ext_type) + return out + + def fixed_shape_tensor(DataType value_type, shape, dim_names=None, permutation=None): """ Create instance of fixed shape tensor extension type with shape and optional From 95bce2e2c757ada51e01eac6edfec5e98ce0158f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 27 Aug 2024 09:12:57 +0900 Subject: [PATCH 018/130] MINOR: [Go] Bump github.com/hamba/avro/v2 from 2.24.1 to 2.25.0 in /go (#43829) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [github.com/hamba/avro/v2](https://github.com/hamba/avro) from 2.24.1 to 2.25.0.
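As promised in [PATCH 017/130] above, here is a minimal, illustrative usage sketch of the Python surface that patch adds for the canonical `arrow.uuid` extension type. It is only a sketch, assuming a pyarrow build that already contains that patch, and it uses only calls that appear in the patch itself (`pa.uuid()`, `pa.ExtensionArray.from_storage`, `UuidScalar.as_py`):

```python
# Illustrative sketch only: assumes pyarrow was built with PATCH 017 applied.
import uuid

import pyarrow as pa

uuid_type = pa.uuid()                           # canonical "arrow.uuid" extension type
assert uuid_type.extension_name == "arrow.uuid"
assert uuid_type.storage_type == pa.binary(16)

# The storage type is FixedSizeBinary(16); wrap it to get a UUID extension array.
values = [uuid.uuid4().bytes for _ in range(3)] + [None]
storage = pa.array(values, type=pa.binary(16))
arr = pa.ExtensionArray.from_storage(uuid_type, storage)

# Scalars convert to Python uuid.UUID via UuidScalar.as_py(); nulls stay None.
assert isinstance(arr[0], pa.UuidScalar)
assert arr[0].as_py() == uuid.UUID(bytes=values[0])
assert arr.to_pylist()[-1] is None
```

The patch's own `test_uuid_extension` additionally shows that such an array round-trips through IPC and is reconstructed with the same extension type.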
Release notes (sourced from github.com/hamba/avro/v2's releases): v2.25.0. Full Changelog: https://github.com/hamba/avro/compare/v2.24.1...v2.24.2

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=github.com/hamba/avro/v2&package-manager=go_modules&previous-version=2.24.1&new-version=2.25.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- go/go.mod | 2 +- go/go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go/go.mod b/go/go.mod index 9f4222a541bb6..97ac05685970c 100644 --- a/go/go.mod +++ b/go/go.mod @@ -47,7 +47,7 @@ require ( require ( github.com/google/uuid v1.6.0 - github.com/hamba/avro/v2 v2.24.1 + github.com/hamba/avro/v2 v2.25.0 github.com/huandu/xstrings v1.4.0 github.com/substrait-io/substrait-go v0.6.0 github.com/tidwall/sjson v1.2.5 ) diff --git a/go/go.sum b/go/go.sum index c7eb3a66deeec..bd761e1589453 100644 --- a/go/go.sum +++ b/go/go.sum @@ -43,8 +43,8 @@ github.com/google/pprof v0.0.0-20221118152302-e6195bd50e26 h1:Xim43kblpZXfIBQsbu github.com/google/pprof v0.0.0-20221118152302-e6195bd50e26/go.mod h1:dDKJzRmX4S37WGHujM7tX//fmj1uioxKzKxz3lo4HJo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/hamba/avro/v2 v2.24.1 h1:Xi+7AnhaAc41aA/jmmYpxMsdEDOf1rdup6NJ85P7q2I= -github.com/hamba/avro/v2 v2.24.1/go.mod h1:7vDfy/2+kYCE8WUHoj2et59GTv0ap7ptktMXu0QHePI= +github.com/hamba/avro/v2 v2.25.0 h1:9qig/K4VP5tMq6DuKGfI6YdXncTkPJT1IJDMSv82EeI= +github.com/hamba/avro/v2 v2.25.0/go.mod h1:I8glyswHnpED3Nlx2ZdUe+4LJnCOOyiCzLMno9i/Uu0= github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM= From e8912c98ccec02df6ca387d7c4d2dcf6a285aa05 Mon Sep 17 00:00:00 2001 From: PANKAJ9768 <48675737+PANKAJ9768@users.noreply.github.com> Date: Tue, 27 Aug 2024 05:59:09 +0530 Subject: [PATCH 019/130] GH-43667: [Java] Keeping Flight default header size consistent between server and client (#43697) ### Rationale for this change The Flight client can send headers larger than the server will accept, because the default header-size limits are not consistent between client and server. ### What changes are included in this PR? This PR keeps the default header-size values consistent across server and client. ### Are these changes tested? ### Are there any user-facing changes? 
* GitHub Issue: #43667 Authored-by: pankaj kesari Signed-off-by: David Li --- .../org/apache/arrow/flight/FlightServer.java | 7 ++ .../arrow/flight/TestFlightService.java | 73 +++++++++++++++++++ 2 files changed, 80 insertions(+) diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightServer.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightServer.java index 05dbe42c49172..ac761457f57fd 100644 --- a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightServer.java +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightServer.java @@ -188,6 +188,7 @@ public static final class Builder { private CallHeaderAuthenticator headerAuthenticator = CallHeaderAuthenticator.NO_OP; private ExecutorService executor = null; private int maxInboundMessageSize = MAX_GRPC_MESSAGE_SIZE; + private int maxHeaderListSize = MAX_GRPC_MESSAGE_SIZE; private int backpressureThreshold = DEFAULT_BACKPRESSURE_THRESHOLD; private InputStream certChain; private InputStream key; @@ -324,6 +325,7 @@ public FlightServer build() { builder .executor(exec) .maxInboundMessageSize(maxInboundMessageSize) + .maxInboundMetadataSize(maxHeaderListSize) .addService( ServerInterceptors.intercept( flightService, @@ -366,6 +368,11 @@ public FlightServer build() { return new FlightServer(location, builder.build(), grpcExecutor); } + public Builder setMaxHeaderListSize(int maxHeaderListSize) { + this.maxHeaderListSize = maxHeaderListSize; + return this; + } + /** * Set the maximum size of a message. Defaults to "unlimited", depending on the underlying * transport. diff --git a/java/flight/flight-core/src/test/java/org/apache/arrow/flight/TestFlightService.java b/java/flight/flight-core/src/test/java/org/apache/arrow/flight/TestFlightService.java index 5ebeb44c1d36e..fc3f83e4eafd3 100644 --- a/java/flight/flight-core/src/test/java/org/apache/arrow/flight/TestFlightService.java +++ b/java/flight/flight-core/src/test/java/org/apache/arrow/flight/TestFlightService.java @@ -27,6 +27,7 @@ import java.nio.charset.StandardCharsets; import java.util.Collections; import java.util.Optional; +import java.util.Random; import org.apache.arrow.flight.impl.Flight; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; @@ -152,4 +153,76 @@ public FlightInfo getFlightInfo(CallContext context, FlightDescriptor descriptor assertEquals("No schema is present in FlightInfo", e.getMessage()); } } + + /** + * Test for GH-41584 where flight defaults for header size was not in sync b\w client and server. 
+ */ + @Test + public void testHeaderSizeExchangeInService() throws Exception { + final FlightProducer producer = + new NoOpFlightProducer() { + @Override + public FlightInfo getFlightInfo(CallContext context, FlightDescriptor descriptor) { + String longHeader = + context.getMiddleware(FlightConstants.HEADER_KEY).headers().get("long-header"); + return new FlightInfo( + null, + descriptor, + Collections.emptyList(), + 0, + 0, + false, + IpcOption.DEFAULT, + longHeader.getBytes(StandardCharsets.UTF_8)); + } + }; + + String headerVal = generateRandom(1024 * 10); + FlightCallHeaders callHeaders = new FlightCallHeaders(); + callHeaders.insert("long-header", headerVal); + // sever with default header limit same as client + try (final FlightServer s = + FlightServer.builder(allocator, forGrpcInsecure(LOCALHOST, 0), producer) + .build() + .start(); + final FlightClient client = FlightClient.builder(allocator, s.getLocation()).build()) { + FlightInfo flightInfo = + client.getInfo(FlightDescriptor.path("test"), new HeaderCallOption(callHeaders)); + assertEquals(Optional.empty(), flightInfo.getSchemaOptional()); + assertEquals(new Schema(Collections.emptyList()), flightInfo.getSchema()); + assertArrayEquals(flightInfo.getAppMetadata(), headerVal.getBytes(StandardCharsets.UTF_8)); + } + // server with 15kb header limit + try (final FlightServer s = + FlightServer.builder(allocator, forGrpcInsecure(LOCALHOST, 0), producer) + .setMaxHeaderListSize(1024 * 15) + .build() + .start(); + final FlightClient client = FlightClient.builder(allocator, s.getLocation()).build()) { + FlightInfo flightInfo = + client.getInfo(FlightDescriptor.path("test"), new HeaderCallOption(callHeaders)); + assertEquals(Optional.empty(), flightInfo.getSchemaOptional()); + assertEquals(new Schema(Collections.emptyList()), flightInfo.getSchema()); + assertArrayEquals(flightInfo.getAppMetadata(), headerVal.getBytes(StandardCharsets.UTF_8)); + + callHeaders.insert("another-header", headerVal + headerVal); + FlightRuntimeException e = + assertThrows( + FlightRuntimeException.class, + () -> + client.getInfo(FlightDescriptor.path("test"), new HeaderCallOption(callHeaders))); + assertEquals("http2 exception", e.getMessage()); + } + } + + private static String generateRandom(int size) { + String aToZ = "ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890"; + Random random = new Random(); + StringBuilder res = new StringBuilder(); + for (int i = 0; i < size; i++) { + int randIndex = random.nextInt(aToZ.length()); + res.append(aToZ.charAt(randIndex)); + } + return res.toString(); + } } From aa8950f68d24584c85b7434ad0d9a3e3da52ba90 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 27 Aug 2024 11:06:13 +0900 Subject: [PATCH 020/130] MINOR: [Go] Bump github.com/substrait-io/substrait-go from 0.6.0 to 0.7.0 in /go (#43830) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [github.com/substrait-io/substrait-go](https://github.com/substrait-io/substrait-go) from 0.6.0 to 0.7.0.
Release notes (sourced from github.com/substrait-io/substrait-go's releases), v0.7.0 (2024-08-25):
- Features: Add convenience literal APIs (#47) (597afdb), introducing the literal package
- Changes to the build process or auxiliary tools: extensions: Minor refactoring in extension_mgr.go (#45) (cbd28cb); Move typeName maps to types package (#46) (5556c23)

Commits: 597afdb feat: Add convenience literal APIs (#47); e77df67 feat(types) Make time precision value explicit (#49); a3e8ee0 feat(substrait) Update to substrait v0.55.0 (#48); 2229c12 ci(build-test): golangci should use the go.mod version of golang (#51); cbd28cb chore(extensions): Minor refactoring in extension_mgr.go (#45); 5556c23 chore: Move typeName maps to types package (#46); dd790cb Add a function registry for a given BFT dialect (#32); 828636c ci(build-test): Add golangci-lint to do import checking and other linting (#42); see the full diff in the compare view.

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=github.com/substrait-io/substrait-go&package-manager=go_modules&previous-version=0.6.0&new-version=0.7.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- go/go.mod | 2 +- go/go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go/go.mod b/go/go.mod index 97ac05685970c..a995eee24d563 100644 --- a/go/go.mod +++ b/go/go.mod @@ -49,7 +49,7 @@ require ( github.com/google/uuid v1.6.0 github.com/hamba/avro/v2 v2.25.0 github.com/huandu/xstrings v1.4.0 - github.com/substrait-io/substrait-go v0.6.0 + github.com/substrait-io/substrait-go v0.7.0 github.com/tidwall/sjson v1.2.5 ) diff --git a/go/go.sum b/go/go.sum index bd761e1589453..6f22e11aef03a 100644 --- a/go/go.sum +++ b/go/go.sum @@ -99,8 +99,8 @@ github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/ github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= -github.com/substrait-io/substrait-go v0.6.0 h1:n2G/SGmrn7U5Q39VA8WeM2UfVL5Y/6HX8WAP9uJLNk4= -github.com/substrait-io/substrait-go v0.6.0/go.mod h1:cl8Wsc7aBPDfcHp9+OrUqGpjkgrYlhcDsH/lMP6KUZA= +github.com/substrait-io/substrait-go v0.7.0 h1:53yi73t4wW383+RD1YuhXhbjhP1KzF9GCxPC7SsRlqc= +github.com/substrait-io/substrait-go v0.7.0/go.mod h1:7mjSvIaxk94bOF+YZn/vBOpHK4DWTpBv7nC/btjXCmc= github.com/tidwall/gjson v1.14.2 h1:6BBkirS0rAHjumnjHF6qgy5d2YAJ1TLIaFE2lzfOLqo= github.com/tidwall/gjson v1.14.2/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= From db0029fb1d2af5a72744bdd971505ba025299a0e Mon Sep 17 00:00:00 2001 From: David Li Date: Tue, 27 Aug 2024 11:44:19 +0900 Subject: [PATCH 021/130] MINOR: [Java] Downgrade gRPC to 1.65 (#43839) ### Rationale for this change Newer versions don't run in all CI pipelines due to protoc using a newer glibc. ### What changes are included in this PR? This reverts commit 4af1e491df7ac22217656668b65c3e8d55f5b5ab. ### Are these changes tested? N/A ### Are there any user-facing changes? No Authored-by: David Li Signed-off-by: David Li --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/pom.xml b/java/pom.xml index 77feed12f3f1d..f78d02c0c650f 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -97,7 +97,7 @@ under the License. 2.0.16 33.2.1-jre 4.1.112.Final - 1.66.0 + 1.65.0 3.25.4 2.17.2 3.4.0 From 5b981251e8959236a8b89f597b2ae6fe2c850372 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 27 Aug 2024 11:56:45 +0900 Subject: [PATCH 022/130] MINOR: [Java] Bump org.apache.commons:commons-compress from 1.27.0 to 1.27.1 in /java (#43826) Bumps org.apache.commons:commons-compress from 1.27.0 to 1.27.1. [![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=org.apache.commons:commons-compress&package-manager=maven&previous-version=1.27.0&new-version=1.27.1)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: David Li --- java/compression/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/compression/pom.xml b/java/compression/pom.xml index a1f2bc861da1f..46ed8796423eb 100644 --- a/java/compression/pom.xml +++ b/java/compression/pom.xml @@ -50,7 +50,7 @@ under the License. org.apache.commons commons-compress - 1.27.0 + 1.27.1 com.github.luben From ca4c756a86f8feb67ef1a24a172fc31a9224df5a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 26 Aug 2024 21:01:45 -0700 Subject: [PATCH 023/130] MINOR: [C#] Bump Microsoft.NET.Test.Sdk from 17.10.0 to 17.11.0 in /csharp (#43822) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [Microsoft.NET.Test.Sdk](https://github.com/microsoft/vstest) from 17.10.0 to 17.11.0.
Release notes: sourced from Microsoft.NET.Test.Sdk's releases (v17.11.0 and v17.11.0-release-24373-02; details truncated). Full changelog: https://github.com/microsoft/vstest/compare/v17.10.0...v17.11.0-release-24352-06
[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=Microsoft.NET.Test.Sdk&package-manager=nuget&previous-version=17.10.0&new-version=17.11.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Curt Hagenlocher --- .../Apache.Arrow.Compression.Tests.csproj | 2 +- .../Apache.Arrow.Flight.Sql.Tests.csproj | 2 +- .../Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj | 2 +- csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj index 047cdb94b963e..4ea02e0ed21c0 100644 --- a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj +++ b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj @@ -7,7 +7,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj index dc95f9edf9f7f..fd8274230ec64 100644 --- a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj @@ -6,7 +6,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj index e68a97670cc7e..eae9ab746f283 100644 --- a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj @@ -6,7 +6,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj index f05338313063c..ee71b203218f8 100644 --- a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj +++ b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj @@ -16,7 +16,7 @@ - + all From 909ae175f6c9b255dc6582f45f29efa2c57281eb Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 27 Aug 2024 14:49:45 +0900 Subject: [PATCH 024/130] GH-41056: [GLib][FlightRPC] Add gaflight_client_do_put() and related APIs (#43813) ### Rationale for this change DoPut is needed to upload data. ### What changes are included in this PR? * Add `gaflight_client_do_put()` * Add `GAFlightStreamWriter` * Add `GAFlightMetadataReader` * Add `GAFlightDoPutResult` * Fix `GAFlightRecordBatchWriter` API ### Are these changes tested? No. They aren't tested yet. We will add tests when we implement server side DoPut. ### Are there any user-facing changes? Yes. * GitHub Issue: #41056 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- c_glib/arrow-flight-glib/client.cpp | 337 +++++++++++++++++++++++++++- c_glib/arrow-flight-glib/client.h | 46 ++++ c_glib/arrow-flight-glib/client.hpp | 16 ++ c_glib/arrow-flight-glib/common.cpp | 102 ++------- c_glib/arrow-flight-glib/common.h | 8 +- c_glib/arrow-glib/writer.hpp | 4 + 6 files changed, 421 insertions(+), 92 deletions(-) diff --git a/c_glib/arrow-flight-glib/client.cpp b/c_glib/arrow-flight-glib/client.cpp index 80c47e336f872..23f59c9da69ad 100644 --- a/c_glib/arrow-flight-glib/client.cpp +++ b/c_glib/arrow-flight-glib/client.cpp @@ -33,10 +33,19 @@ G_BEGIN_DECLS * #GAFlightStreamReader is a class for reading record batches from a * server. * + * #GAFlightStreamWriter is a class for writing record batches to a + * server. + * + * #GAFlightMetadataReader is a class for reading metadata from a + * server. + * * #GAFlightCallOptions is a class for options of each call. 
* * #GAFlightClientOptions is a class for options of each client. * + * #GAFlightDoPutResult is a class that has gaflight_client_do_put() + * result. + * * #GAFlightClient is a class for Apache Arrow Flight client. * * Since: 5.0.0 @@ -56,6 +65,128 @@ gaflight_stream_reader_class_init(GAFlightStreamReaderClass *klass) { } +G_DEFINE_TYPE(GAFlightStreamWriter, + gaflight_stream_writer, + GAFLIGHT_TYPE_RECORD_BATCH_WRITER) + +static void +gaflight_stream_writer_init(GAFlightStreamWriter *object) +{ +} + +static void +gaflight_stream_writer_class_init(GAFlightStreamWriterClass *klass) +{ +} + +/** + * gaflight_stream_writer_done_writing: + * @writer: A #GAFlightStreamWriter. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE on error. + * + * Since: 18.0.0 + */ +gboolean +gaflight_stream_writer_done_writing(GAFlightStreamWriter *writer, GError **error) +{ + auto flight_writer = std::static_pointer_cast( + garrow_record_batch_writer_get_raw(GARROW_RECORD_BATCH_WRITER(writer))); + return garrow::check(error, + flight_writer->DoneWriting(), + "[flight-stream-writer][done-writing]"); +} + +struct GAFlightMetadataReaderPrivate +{ + arrow::flight::FlightMetadataReader *reader; +}; + +enum { + PROP_METADATA_READER_READER = 1, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GAFlightMetadataReader, + gaflight_metadata_reader, + G_TYPE_OBJECT) + +#define GAFLIGHT_METADATA_READER_GET_PRIVATE(object) \ + static_cast( \ + gaflight_metadata_reader_get_instance_private(GAFLIGHT_METADATA_READER(object))) + +static void +gaflight_metadata_reader_finalize(GObject *object) +{ + auto priv = GAFLIGHT_METADATA_READER_GET_PRIVATE(object); + delete priv->reader; + G_OBJECT_CLASS(gaflight_metadata_reader_parent_class)->finalize(object); +} + +static void +gaflight_metadata_reader_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GAFLIGHT_METADATA_READER_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_METADATA_READER_READER: + priv->reader = + static_cast(g_value_get_pointer(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gaflight_metadata_reader_init(GAFlightMetadataReader *object) +{ +} + +static void +gaflight_metadata_reader_class_init(GAFlightMetadataReaderClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->finalize = gaflight_metadata_reader_finalize; + gobject_class->set_property = gaflight_metadata_reader_set_property; + + GParamSpec *spec; + spec = g_param_spec_pointer( + "reader", + nullptr, + nullptr, + static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_METADATA_READER_READER, spec); +} + +/** + * gaflight_metadata_reader_read: + * @reader: A #GAFlightMetadataReader. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full): The metadata on success, %NULL on error. 
+ * + * Since: 18.0.0 + */ +GArrowBuffer * +gaflight_metadata_reader_read(GAFlightMetadataReader *reader, GError **error) +{ + auto flight_reader = gaflight_metadata_reader_get_raw(reader); + std::shared_ptr metadata; + if (garrow::check(error, + flight_reader->ReadMetadata(&metadata), + "[flight-metadata-reader][read]")) { + return garrow_buffer_new_raw(&metadata); + } else { + return nullptr; + } +} + typedef struct GAFlightCallOptionsPrivate_ { arrow::flight::FlightCallOptions options; @@ -385,6 +516,137 @@ gaflight_client_options_new(void) g_object_new(GAFLIGHT_TYPE_CLIENT_OPTIONS, NULL)); } +struct GAFlightDoPutResultPrivate +{ + GAFlightStreamWriter *writer; + GAFlightMetadataReader *reader; +}; + +enum { + PROP_DO_PUT_RESULT_RESULT = 1, + PROP_DO_PUT_RESULT_WRITER, + PROP_DO_PUT_RESULT_READER, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GAFlightDoPutResult, gaflight_do_put_result, G_TYPE_OBJECT) + +#define GAFLIGHT_DO_PUT_RESULT_GET_PRIVATE(object) \ + static_cast( \ + gaflight_do_put_result_get_instance_private(GAFLIGHT_DO_PUT_RESULT(object))) + +static void +gaflight_do_put_result_dispose(GObject *object) +{ + auto priv = GAFLIGHT_DO_PUT_RESULT_GET_PRIVATE(object); + + if (priv->writer) { + g_object_unref(priv->writer); + priv->writer = nullptr; + } + + if (priv->reader) { + g_object_unref(priv->reader); + priv->reader = nullptr; + } + + G_OBJECT_CLASS(gaflight_do_put_result_parent_class)->dispose(object); +} + +static void +gaflight_do_put_result_init(GAFlightDoPutResult *object) +{ +} + +static void +gaflight_do_put_result_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GAFLIGHT_DO_PUT_RESULT_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_DO_PUT_RESULT_RESULT: + { + auto result = static_cast( + g_value_get_pointer(value)); + priv->writer = gaflight_stream_writer_new_raw(result->writer.release()); + priv->reader = gaflight_metadata_reader_new_raw(result->reader.release()); + break; + } + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gaflight_do_put_result_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GAFLIGHT_DO_PUT_RESULT_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_DO_PUT_RESULT_WRITER: + g_value_set_object(value, priv->writer); + break; + case PROP_DO_PUT_RESULT_READER: + g_value_set_object(value, priv->reader); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gaflight_do_put_result_class_init(GAFlightDoPutResultClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->dispose = gaflight_do_put_result_dispose; + gobject_class->set_property = gaflight_do_put_result_set_property; + gobject_class->get_property = gaflight_do_put_result_get_property; + + GParamSpec *spec; + spec = g_param_spec_pointer( + "result", + nullptr, + nullptr, + static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_DO_PUT_RESULT_RESULT, spec); + + /** + * GAFlightDoPutResult:writer: + * + * A writer to write record batches to. + * + * Since: 18.0.0 + */ + spec = g_param_spec_object("writer", + nullptr, + nullptr, + GAFLIGHT_TYPE_STREAM_WRITER, + static_cast(G_PARAM_READABLE)); + g_object_class_install_property(gobject_class, PROP_DO_PUT_RESULT_WRITER, spec); + + /** + * GAFlightDoPutResult:reader: + * + * A reader for application metadata from the server. 
+ * + * Since: 18.0.0 + */ + spec = g_param_spec_object("reader", + nullptr, + nullptr, + GAFLIGHT_TYPE_METADATA_READER, + static_cast(G_PARAM_READABLE)); + g_object_class_install_property(gobject_class, PROP_DO_PUT_RESULT_READER, spec); +} + struct GAFlightClientPrivate { std::shared_ptr client; @@ -661,6 +923,51 @@ gaflight_client_do_get(GAFlightClient *client, return gaflight_stream_reader_new_raw(flight_reader.release(), TRUE); } +/** + * gaflight_client_do_put: + * @client: A #GAFlightClient. + * @descriptor: A #GAFlightDescriptor. + * @schema: A #GArrowSchema. + * @options: (nullable): A #GAFlightCallOptions. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Upload data to a Flight described by the given descriptor. The + * caller must call garrow_record_batch_writer_close() on the + * returned stream once they are done writing. + * + * The reader and writer are linked; closing the writer will also + * close the reader. Use garrow_flight_stream_writer_done_writing() to + * only close the write side of the channel. + * + * Returns: (nullable) (transfer full): + * The #GAFlighDoPutResult holding a reader and a writer on success, + * %NULL on error. + * + * Since: 18.0.0 + */ +GAFlightDoPutResult * +gaflight_client_do_put(GAFlightClient *client, + GAFlightDescriptor *descriptor, + GArrowSchema *schema, + GAFlightCallOptions *options, + GError **error) +{ + auto flight_client = gaflight_client_get_raw(client); + auto flight_descriptor = gaflight_descriptor_get_raw(descriptor); + auto arrow_schema = garrow_schema_get_raw(schema); + arrow::flight::FlightCallOptions flight_default_options; + auto flight_options = &flight_default_options; + if (options) { + flight_options = gaflight_call_options_get_raw(options); + } + auto result = flight_client->DoPut(*flight_options, *flight_descriptor, arrow_schema); + if (!garrow::check(error, result, "[flight-client][do-put]")) { + return nullptr; + } + auto flight_result = std::move(*result); + return gaflight_do_put_result_new_raw(&flight_result); +} + G_END_DECLS GAFlightStreamReader * @@ -672,7 +979,28 @@ gaflight_stream_reader_new_raw(arrow::flight::FlightStreamReader *flight_reader, flight_reader, "is-owner", is_owner, - NULL)); + nullptr)); +} + +GAFlightStreamWriter * +gaflight_stream_writer_new_raw(arrow::flight::FlightStreamWriter *flight_writer) +{ + return GAFLIGHT_STREAM_WRITER( + g_object_new(GAFLIGHT_TYPE_STREAM_WRITER, "writer", flight_writer, nullptr)); +} + +GAFlightMetadataReader * +gaflight_metadata_reader_new_raw(arrow::flight::FlightMetadataReader *flight_reader) +{ + return GAFLIGHT_METADATA_READER( + g_object_new(GAFLIGHT_TYPE_METADATA_READER, "reader", flight_reader, nullptr)); +} + +arrow::flight::FlightMetadataReader * +gaflight_metadata_reader_get_raw(GAFlightMetadataReader *reader) +{ + auto priv = GAFLIGHT_METADATA_READER_GET_PRIVATE(reader); + return priv->reader; } arrow::flight::FlightCallOptions * @@ -689,6 +1017,13 @@ gaflight_client_options_get_raw(GAFlightClientOptions *options) return &(priv->options); } +GAFlightDoPutResult * +gaflight_do_put_result_new_raw(arrow::flight::FlightClient::DoPutResult *flight_result) +{ + return GAFLIGHT_DO_PUT_RESULT( + g_object_new(GAFLIGHT_TYPE_DO_PUT_RESULT, "result", flight_result, nullptr)); +} + std::shared_ptr gaflight_client_get_raw(GAFlightClient *client) { diff --git a/c_glib/arrow-flight-glib/client.h b/c_glib/arrow-flight-glib/client.h index a91bbe55e3c04..12c5a06b810e1 100644 --- a/c_glib/arrow-flight-glib/client.h +++ 
b/c_glib/arrow-flight-glib/client.h @@ -35,6 +35,35 @@ struct _GAFlightStreamReaderClass GAFlightRecordBatchReaderClass parent_class; }; +#define GAFLIGHT_TYPE_STREAM_WRITER (gaflight_stream_writer_get_type()) +GAFLIGHT_AVAILABLE_IN_18_0 +G_DECLARE_DERIVABLE_TYPE(GAFlightStreamWriter, + gaflight_stream_writer, + GAFLIGHT, + STREAM_WRITER, + GAFlightRecordBatchWriter) +struct _GAFlightStreamWriterClass +{ + GAFlightRecordBatchWriterClass parent_class; +}; + +GAFLIGHT_AVAILABLE_IN_18_0 +gboolean +gaflight_stream_writer_done_writing(GAFlightStreamWriter *writer, GError **error); + +#define GAFLIGHT_TYPE_METADATA_READER (gaflight_metadata_reader_get_type()) +GAFLIGHT_AVAILABLE_IN_18_0 +G_DECLARE_DERIVABLE_TYPE( + GAFlightMetadataReader, gaflight_metadata_reader, GAFLIGHT, METADATA_READER, GObject) +struct _GAFlightMetadataReaderClass +{ + GObjectClass parent_class; +}; + +GAFLIGHT_AVAILABLE_IN_18_0 +GArrowBuffer * +gaflight_metadata_reader_read(GAFlightMetadataReader *reader, GError **error); + #define GAFLIGHT_TYPE_CALL_OPTIONS (gaflight_call_options_get_type()) GAFLIGHT_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE( @@ -75,6 +104,15 @@ GAFLIGHT_AVAILABLE_IN_5_0 GAFlightClientOptions * gaflight_client_options_new(void); +#define GAFLIGHT_TYPE_DO_PUT_RESULT (gaflight_do_put_result_get_type()) +GAFLIGHT_AVAILABLE_IN_18_0 +G_DECLARE_DERIVABLE_TYPE( + GAFlightDoPutResult, gaflight_do_put_result, GAFLIGHT, DO_PUT_RESULT, GObject) +struct _GAFlightDoPutResultClass +{ + GObjectClass parent_class; +}; + #define GAFLIGHT_TYPE_CLIENT (gaflight_client_get_type()) GAFLIGHT_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE(GAFlightClient, gaflight_client, GAFLIGHT, CLIENT, GObject) @@ -124,4 +162,12 @@ gaflight_client_do_get(GAFlightClient *client, GAFlightCallOptions *options, GError **error); +GAFLIGHT_AVAILABLE_IN_18_0 +GAFlightDoPutResult * +gaflight_client_do_put(GAFlightClient *client, + GAFlightDescriptor *descriptor, + GArrowSchema *schema, + GAFlightCallOptions *options, + GError **error); + G_END_DECLS diff --git a/c_glib/arrow-flight-glib/client.hpp b/c_glib/arrow-flight-glib/client.hpp index 185a28e6dc4bd..888f87ecb5732 100644 --- a/c_glib/arrow-flight-glib/client.hpp +++ b/c_glib/arrow-flight-glib/client.hpp @@ -28,6 +28,18 @@ GAFlightStreamReader * gaflight_stream_reader_new_raw(arrow::flight::FlightStreamReader *flight_reader, gboolean is_owner); +GAFLIGHT_EXTERN +GAFlightStreamWriter * +gaflight_stream_writer_new_raw(arrow::flight::FlightStreamWriter *flight_writer); + +GAFLIGHT_EXTERN +GAFlightMetadataReader * +gaflight_metadata_reader_new_raw(arrow::flight::FlightMetadataReader *flight_reader); + +GAFLIGHT_EXTERN +arrow::flight::FlightMetadataReader * +gaflight_metadata_reader_get_raw(GAFlightMetadataReader *reader); + GAFLIGHT_EXTERN arrow::flight::FlightCallOptions * gaflight_call_options_get_raw(GAFlightCallOptions *options); @@ -36,6 +48,10 @@ GAFLIGHT_EXTERN arrow::flight::FlightClientOptions * gaflight_client_options_get_raw(GAFlightClientOptions *options); +GAFLIGHT_EXTERN +GAFlightDoPutResult * +gaflight_do_put_result_new_raw(arrow::flight::FlightClient::DoPutResult *flight_result); + GAFLIGHT_EXTERN std::shared_ptr gaflight_client_get_raw(GAFlightClient *client); diff --git a/c_glib/arrow-flight-glib/common.cpp b/c_glib/arrow-flight-glib/common.cpp index f7eea08c264b3..3deaf67cc14e8 100644 --- a/c_glib/arrow-flight-glib/common.cpp +++ b/c_glib/arrow-flight-glib/common.cpp @@ -1196,7 +1196,7 @@ gaflight_record_batch_reader_finalize(GObject *object) if (priv->is_owner) { delete priv->reader; 
} - G_OBJECT_CLASS(gaflight_info_parent_class)->finalize(object); + G_OBJECT_CLASS(gaflight_record_batch_reader_parent_class)->finalize(object); } static void @@ -1300,57 +1300,9 @@ gaflight_record_batch_reader_read_all(GAFlightRecordBatchReader *reader, GError } } -typedef struct GAFlightRecordBatchWriterPrivate_ -{ - arrow::flight::MetadataRecordBatchWriter *writer; - bool is_owner; -} GAFlightRecordBatchWriterPrivate; - -enum { - PROP_RECORD_BATCH_WRITER_WRITER = 1, - PROP_RECORD_BATCH_WRITER_IS_OWNER, -}; - -G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GAFlightRecordBatchWriter, - gaflight_record_batch_writer, - GARROW_TYPE_RECORD_BATCH_WRITER) - -#define GAFLIGHT_RECORD_BATCH_WRITER_GET_PRIVATE(object) \ - static_cast( \ - gaflight_record_batch_writer_get_instance_private( \ - GAFLIGHT_RECORD_BATCH_WRITER(object))) - -static void -gaflight_record_batch_writer_finalize(GObject *object) -{ - auto priv = GAFLIGHT_RECORD_BATCH_WRITER_GET_PRIVATE(object); - if (priv->is_owner) { - delete priv->writer; - } - G_OBJECT_CLASS(gaflight_info_parent_class)->finalize(object); -} - -static void -gaflight_record_batch_writer_set_property(GObject *object, - guint prop_id, - const GValue *value, - GParamSpec *pspec) -{ - auto priv = GAFLIGHT_RECORD_BATCH_WRITER_GET_PRIVATE(object); - - switch (prop_id) { - case PROP_RECORD_BATCH_WRITER_WRITER: - priv->writer = - static_cast(g_value_get_pointer(value)); - break; - case PROP_RECORD_BATCH_WRITER_IS_OWNER: - priv->is_owner = g_value_get_boolean(value); - break; - default: - G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); - break; - } -} +G_DEFINE_ABSTRACT_TYPE(GAFlightRecordBatchWriter, + gaflight_record_batch_writer, + GARROW_TYPE_RECORD_BATCH_WRITER) static void gaflight_record_batch_writer_init(GAFlightRecordBatchWriter *object) @@ -1360,26 +1312,6 @@ gaflight_record_batch_writer_init(GAFlightRecordBatchWriter *object) static void gaflight_record_batch_writer_class_init(GAFlightRecordBatchWriterClass *klass) { - auto gobject_class = G_OBJECT_CLASS(klass); - - gobject_class->finalize = gaflight_record_batch_writer_finalize; - gobject_class->set_property = gaflight_record_batch_writer_set_property; - - GParamSpec *spec; - spec = g_param_spec_pointer( - "writer", - nullptr, - nullptr, - static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); - g_object_class_install_property(gobject_class, PROP_RECORD_BATCH_WRITER_WRITER, spec); - - spec = g_param_spec_boolean( - "is-owner", - nullptr, - nullptr, - TRUE, - static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); - g_object_class_install_property(gobject_class, PROP_RECORD_BATCH_WRITER_IS_OWNER, spec); } /** @@ -1402,7 +1334,8 @@ gaflight_record_batch_writer_begin(GAFlightRecordBatchWriter *writer, GArrowWriteOptions *options, GError **error) { - auto flight_writer = gaflight_record_batch_writer_get_raw(writer); + auto flight_writer = std::static_pointer_cast( + garrow_record_batch_writer_get_raw(GARROW_RECORD_BATCH_WRITER(writer))); auto arrow_schema = garrow_schema_get_raw(schema); arrow::ipc::IpcWriteOptions arrow_write_options; if (options) { @@ -1432,7 +1365,8 @@ gaflight_record_batch_writer_write_metadata(GAFlightRecordBatchWriter *writer, GArrowBuffer *metadata, GError **error) { - auto flight_writer = gaflight_record_batch_writer_get_raw(writer); + auto flight_writer = std::static_pointer_cast( + garrow_record_batch_writer_get_raw(GARROW_RECORD_BATCH_WRITER(writer))); auto arrow_metadata = garrow_buffer_get_raw(metadata); return garrow::check(error, flight_writer->WriteMetadata(arrow_metadata), 
@@ -1440,7 +1374,7 @@ gaflight_record_batch_writer_write_metadata(GAFlightRecordBatchWriter *writer, } /** - * gaflight_record_batch_writer_write: + * gaflight_record_batch_writer_write_record_batch: * @writer: A #GAFlightRecordBatchWriter. * @record_batch: A #GArrowRecordBatch. * @metadata: (nullable): A #GArrowBuffer. @@ -1453,12 +1387,13 @@ gaflight_record_batch_writer_write_metadata(GAFlightRecordBatchWriter *writer, * Since: 18.0.0 */ gboolean -gaflight_record_batch_writer_write(GAFlightRecordBatchWriter *writer, - GArrowRecordBatch *record_batch, - GArrowBuffer *metadata, - GError **error) +gaflight_record_batch_writer_write_record_batch(GAFlightRecordBatchWriter *writer, + GArrowRecordBatch *record_batch, + GArrowBuffer *metadata, + GError **error) { - auto flight_writer = gaflight_record_batch_writer_get_raw(writer); + auto flight_writer = std::static_pointer_cast( + garrow_record_batch_writer_get_raw(GARROW_RECORD_BATCH_WRITER(writer))); auto arrow_record_batch = garrow_record_batch_get_raw(record_batch); auto arrow_metadata = garrow_buffer_get_raw(metadata); return garrow::check( @@ -1599,10 +1534,3 @@ gaflight_record_batch_reader_get_raw(GAFlightRecordBatchReader *reader) auto priv = GAFLIGHT_RECORD_BATCH_READER_GET_PRIVATE(reader); return priv->reader; } - -arrow::flight::MetadataRecordBatchWriter * -gaflight_record_batch_writer_get_raw(GAFlightRecordBatchWriter *writer) -{ - auto priv = GAFLIGHT_RECORD_BATCH_WRITER_GET_PRIVATE(writer); - return priv->writer; -} diff --git a/c_glib/arrow-flight-glib/common.h b/c_glib/arrow-flight-glib/common.h index 91c828caabb36..726132fe4921b 100644 --- a/c_glib/arrow-flight-glib/common.h +++ b/c_glib/arrow-flight-glib/common.h @@ -259,9 +259,9 @@ gaflight_record_batch_writer_write_metadata(GAFlightRecordBatchWriter *writer, GAFLIGHT_AVAILABLE_IN_18_0 gboolean -gaflight_record_batch_writer_write(GAFlightRecordBatchWriter *writer, - GArrowRecordBatch *record_batch, - GArrowBuffer *metadata, - GError **error); +gaflight_record_batch_writer_write_record_batch(GAFlightRecordBatchWriter *writer, + GArrowRecordBatch *record_batch, + GArrowBuffer *metadata, + GError **error); G_END_DECLS diff --git a/c_glib/arrow-glib/writer.hpp b/c_glib/arrow-glib/writer.hpp index aa87ffe77d79b..1d85ac52f88d1 100644 --- a/c_glib/arrow-glib/writer.hpp +++ b/c_glib/arrow-glib/writer.hpp @@ -25,16 +25,20 @@ #include +GARROW_AVAILABLE_IN_ALL GArrowRecordBatchWriter * garrow_record_batch_writer_new_raw( std::shared_ptr *arrow_writer); +GARROW_AVAILABLE_IN_ALL std::shared_ptr garrow_record_batch_writer_get_raw(GArrowRecordBatchWriter *writer); +GARROW_AVAILABLE_IN_ALL GArrowRecordBatchStreamWriter * garrow_record_batch_stream_writer_new_raw( std::shared_ptr *arrow_writer); +GARROW_AVAILABLE_IN_ALL GArrowRecordBatchFileWriter * garrow_record_batch_file_writer_new_raw( std::shared_ptr *arrow_writer); From ef336255d75aa8e13819187563863cf175d4b70b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Tue, 27 Aug 2024 10:30:23 +0200 Subject: [PATCH 025/130] GH-43815: [CI][Packaging][Python] Avoid uploading wheel to gemfury if version already exists (#43816) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What changes are included in this PR? Check whether version exists on gemfury before trying upload ### Are these changes tested? Will be tested via archery ### Are there any user-facing changes? 
No * GitHub Issue: #43815 Lead-authored-by: Raúl Cumplido Co-authored-by: Sutou Kouhei Signed-off-by: Raúl Cumplido --- dev/tasks/macros.jinja | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/dev/tasks/macros.jinja b/dev/tasks/macros.jinja index 6423ca0e9efda..df55f32222e91 100644 --- a/dev/tasks/macros.jinja +++ b/dev/tasks/macros.jinja @@ -169,10 +169,14 @@ env: - name: Upload package to Gemfury shell: bash run: | - fury push \ - --api-token=${CROSSBOW_GEMFURY_TOKEN} \ - --as=${CROSSBOW_GEMFURY_ORG} \ - {{ pattern }} + if $(fury versions --as=${CROSSBOW_GEMFURY_ORG} --api-token=${CROSSBOW_GEMFURY_TOKEN} pyarrow | grep --fixed-strings -q "{{ arrow.no_rc_version }}"); then + echo "Version {{ arrow.no_rc_version }} already exists. Avoid pushing version." + else + fury push \ + --api-token=${CROSSBOW_GEMFURY_TOKEN} \ + --as=${CROSSBOW_GEMFURY_ORG} \ + {{ pattern }} + fi env: CROSSBOW_GEMFURY_TOKEN: {{ '${{ secrets.CROSSBOW_GEMFURY_TOKEN }}' }} CROSSBOW_GEMFURY_ORG: {{ '${{ secrets.CROSSBOW_GEMFURY_ORG }}' }} From ab38581f0f73e5d1e769c617e42e60e9f58c68dc Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky <33523178+joellubi@users.noreply.github.com> Date: Tue, 27 Aug 2024 13:17:39 -0400 Subject: [PATCH 026/130] GH-43790: [Go][Parquet] Add support for LZ4_RAW compression codec (#43835) ### Rationale for this change Fixes: #43790 The LZ4 compression codec for Parquet is no longer ambiguous, as it has been superceded by the [LZ4_RAW](https://github.com/apache/parquet-format/blob/master/Compression.md#lz4_raw) spec. ### What changes are included in this PR? - Add `LZ4Raw` compression codec - Split out `StreamingCodec` methods from core `Codec` interface - Various conformance/roundtrip tests - Set of benchmarks for reading/writing an Arrow table to/from Parquet, using each compression codec ### Are these changes tested? Yes ### Are there any user-facing changes? 
- New codec `LZ4Raw` is available - `Codec` interface no long provides the following methods, which are now part of `StreamingCodec`: - `NewReader` - `NewWriter` - `NewWriterLevel` * GitHub Issue: #43790 Authored-by: Joel Lubinitsky Signed-off-by: Joel Lubinitsky --- go/parquet/compress/compress.go | 22 ++-- go/parquet/compress/compress_test.go | 8 +- go/parquet/compress/lz4_raw.go | 66 ++++++++++++ go/parquet/file/file_reader_test.go | 127 +++++++++++++++++++++++ go/parquet/file/file_writer_test.go | 58 ++++++++++- go/parquet/pqarrow/reader_writer_test.go | 111 ++++++++++++++++++++ 6 files changed, 380 insertions(+), 12 deletions(-) create mode 100644 go/parquet/compress/lz4_raw.go diff --git a/go/parquet/compress/compress.go b/go/parquet/compress/compress.go index b6a1349133e84..92f2ae99bb13f 100644 --- a/go/parquet/compress/compress.go +++ b/go/parquet/compress/compress.go @@ -49,8 +49,9 @@ var Codecs = struct { Brotli Compression // LZ4 unsupported in this library due to problematic issues between the Hadoop LZ4 spec vs regular lz4 // see: http://mail-archives.apache.org/mod_mbox/arrow-dev/202007.mbox/%3CCAAri41v24xuA8MGHLDvgSnE+7AAgOhiEukemW_oPNHMvfMmrWw@mail.gmail.com%3E - Lz4 Compression - Zstd Compression + Lz4 Compression + Zstd Compression + Lz4Raw Compression }{ Uncompressed: Compression(parquet.CompressionCodec_UNCOMPRESSED), Snappy: Compression(parquet.CompressionCodec_SNAPPY), @@ -59,17 +60,12 @@ var Codecs = struct { Brotli: Compression(parquet.CompressionCodec_BROTLI), Lz4: Compression(parquet.CompressionCodec_LZ4), Zstd: Compression(parquet.CompressionCodec_ZSTD), + Lz4Raw: Compression(parquet.CompressionCodec_LZ4_RAW), } // Codec is an interface which is implemented for each compression type in order to make the interactions easy to // implement. Most consumers won't be calling GetCodec directly. type Codec interface { - // NewReader provides a reader that wraps a stream with compressed data to stream the uncompressed data - NewReader(io.Reader) io.ReadCloser - // NewWriter provides a wrapper around a write stream to compress data before writing it. - NewWriter(io.Writer) io.WriteCloser - // NewWriterLevel is like NewWriter but allows specifying the compression level - NewWriterLevel(io.Writer, int) (io.WriteCloser, error) // Encode encodes a block of data given by src and returns the compressed block. dst should be either nil // or sized large enough to fit the compressed block (use CompressBound to allocate). dst and src should not // overlap since some of the compression types don't allow it. @@ -90,6 +86,16 @@ type Codec interface { Decode(dst, src []byte) []byte } +// StreamingCodec is an interface that may be implemented for compression codecs that expose a streaming API. +type StreamingCodec interface { + // NewReader provides a reader that wraps a stream with compressed data to stream the uncompressed data + NewReader(io.Reader) io.ReadCloser + // NewWriter provides a wrapper around a write stream to compress data before writing it. + NewWriter(io.Writer) io.WriteCloser + // NewWriterLevel is like NewWriter but allows specifying the compression level + NewWriterLevel(io.Writer, int) (io.WriteCloser, error) +} + var codecs = map[Compression]Codec{} // RegisterCodec adds or overrides a codec implementation for a given compression algorithm. 
diff --git a/go/parquet/compress/compress_test.go b/go/parquet/compress/compress_test.go index 843062c0d024a..5aac74759e1f9 100644 --- a/go/parquet/compress/compress_test.go +++ b/go/parquet/compress/compress_test.go @@ -66,8 +66,8 @@ func TestCompressDataOneShot(t *testing.T) { {compress.Codecs.Gzip}, {compress.Codecs.Brotli}, {compress.Codecs.Zstd}, + {compress.Codecs.Lz4Raw}, // {compress.Codecs.Lzo}, - // {compress.Codecs.Lz4}, } for _, tt := range tests { @@ -107,9 +107,11 @@ func TestCompressReaderWriter(t *testing.T) { var buf bytes.Buffer codec, err := compress.GetCodec(tt.c) assert.NoError(t, err) + streamingCodec, ok := codec.(compress.StreamingCodec) + assert.True(t, ok) data := makeRandomData(RandomDataSize) - wr := codec.NewWriter(&buf) + wr := streamingCodec.NewWriter(&buf) const chunkSize = 1111 input := data @@ -129,7 +131,7 @@ func TestCompressReaderWriter(t *testing.T) { } wr.Close() - rdr := codec.NewReader(&buf) + rdr := streamingCodec.NewReader(&buf) out, err := io.ReadAll(rdr) assert.NoError(t, err) assert.Exactly(t, data, out) diff --git a/go/parquet/compress/lz4_raw.go b/go/parquet/compress/lz4_raw.go new file mode 100644 index 0000000000000..788d9520a668b --- /dev/null +++ b/go/parquet/compress/lz4_raw.go @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package compress + +import ( + "sync" + + "github.com/pierrec/lz4/v4" +) + +// lz4.Compressor is not goroutine-safe, so we use a pool to amortize the cost +// of allocating a new one for each call to Encode(). 
+var compressorPool = sync.Pool{New: func() interface{} { return new(lz4.Compressor) }} + +func compressBlock(src, dst []byte) (int, error) { + c := compressorPool.Get().(*lz4.Compressor) + defer compressorPool.Put(c) + return c.CompressBlock(src, dst) +} + +type lz4RawCodec struct{} + +func (c lz4RawCodec) Encode(dst, src []byte) []byte { + n, err := compressBlock(src, dst[:cap(dst)]) + if err != nil { + panic(err) + } + + return dst[:n] +} + +func (c lz4RawCodec) EncodeLevel(dst, src []byte, _ int) []byte { + // the lz4 block implementation does not allow level to be set + return c.Encode(dst, src) +} + +func (lz4RawCodec) Decode(dst, src []byte) []byte { + n, err := lz4.UncompressBlock(src, dst) + if err != nil { + panic(err) + } + + return dst[:n] +} + +func (c lz4RawCodec) CompressBound(len int64) int64 { + return int64(lz4.CompressBlockBound(int(len))) +} + +func init() { + RegisterCodec(Codecs.Lz4Raw, lz4RawCodec{}) +} diff --git a/go/parquet/file/file_reader_test.go b/go/parquet/file/file_reader_test.go index 547ec475c2720..35f4da4e8667c 100644 --- a/go/parquet/file/file_reader_test.go +++ b/go/parquet/file/file_reader_test.go @@ -644,3 +644,130 @@ func TestDeltaBinaryPackedMultipleBatches(t *testing.T) { require.Equalf(t, size, totalRows, "Expected %d rows, but got %d rows", size, totalRows) } + +// Test read file lz4_raw_compressed.parquet +// Contents documented at https://github.com/apache/parquet-testing/commit/ddd898958803cb89b7156c6350584d1cda0fe8de +func TestLZ4RawFileRead(t *testing.T) { + dir := os.Getenv("PARQUET_TEST_DATA") + if dir == "" { + t.Skip("no path supplied with PARQUET_TEST_DATA") + } + require.DirExists(t, dir) + + props := parquet.NewReaderProperties(memory.DefaultAllocator) + fileReader, err := file.OpenParquetFile(path.Join(dir, "lz4_raw_compressed.parquet"), + false, file.WithReadProps(props)) + require.NoError(t, err) + defer fileReader.Close() + + nRows := 4 + nCols := 3 + require.Equal(t, 1, fileReader.NumRowGroups()) + rgr := fileReader.RowGroup(0) + require.EqualValues(t, nRows, rgr.NumRows()) + require.EqualValues(t, nCols, rgr.NumColumns()) + + rdr, err := rgr.Column(0) + require.NoError(t, err) + + rowsInt64, ok := rdr.(*file.Int64ColumnChunkReader) + require.True(t, ok) + + valsInt64 := make([]int64, nRows) + total, read, err := rowsInt64.ReadBatch(int64(nRows), valsInt64, nil, nil) + require.NoError(t, err) + require.Equal(t, int64(nRows), total) + require.Equal(t, nRows, read) + + expectedValsInt64 := []int64{ + 1593604800, + 1593604800, + 1593604801, + 1593604801, + } + require.Equal(t, expectedValsInt64, valsInt64) + + rdr, err = rgr.Column(1) + require.NoError(t, err) + + rowsByteArray, ok := rdr.(*file.ByteArrayColumnChunkReader) + require.True(t, ok) + + valsByteArray := make([]parquet.ByteArray, nRows) + total, read, err = rowsByteArray.ReadBatch(int64(nRows), valsByteArray, nil, nil) + require.NoError(t, err) + require.Equal(t, int64(nRows), total) + require.Equal(t, nRows, read) + + expectedValsByteArray := []parquet.ByteArray{ + []byte("abc"), + []byte("def"), + []byte("abc"), + []byte("def"), + } + require.Equal(t, expectedValsByteArray, valsByteArray) + + rdr, err = rgr.Column(2) + require.NoError(t, err) + + rowsFloat64, ok := rdr.(*file.Float64ColumnChunkReader) + require.True(t, ok) + + valsFloat64 := make([]float64, nRows) + total, read, err = rowsFloat64.ReadBatch(int64(nRows), valsFloat64, nil, nil) + require.NoError(t, err) + require.Equal(t, int64(nRows), total) + require.Equal(t, nRows, read) + + expectedValsFloat64 := 
[]float64{ + 42.0, + 7.7, + 42.125, + 7.7, + } + require.Equal(t, expectedValsFloat64, valsFloat64) +} + +// Test read file lz4_raw_compressed_larger.parquet +// Contents documented at https://github.com/apache/parquet-testing/commit/ddd898958803cb89b7156c6350584d1cda0fe8de +func TestLZ4RawLargerFileRead(t *testing.T) { + dir := os.Getenv("PARQUET_TEST_DATA") + if dir == "" { + t.Skip("no path supplied with PARQUET_TEST_DATA") + } + require.DirExists(t, dir) + + props := parquet.NewReaderProperties(memory.DefaultAllocator) + fileReader, err := file.OpenParquetFile(path.Join(dir, "lz4_raw_compressed_larger.parquet"), + false, file.WithReadProps(props)) + require.NoError(t, err) + defer fileReader.Close() + + nRows := 10000 + nCols := 1 + require.Equal(t, 1, fileReader.NumRowGroups()) + rgr := fileReader.RowGroup(0) + require.EqualValues(t, nRows, rgr.NumRows()) + require.EqualValues(t, nCols, rgr.NumColumns()) + + rdr, err := rgr.Column(0) + require.NoError(t, err) + + rows, ok := rdr.(*file.ByteArrayColumnChunkReader) + require.True(t, ok) + + vals := make([]parquet.ByteArray, nRows) + total, read, err := rows.ReadBatch(int64(nRows), vals, nil, nil) + require.NoError(t, err) + require.Equal(t, int64(nRows), total) + require.Equal(t, nRows, read) + + expectedValsHead := []parquet.ByteArray{ + []byte("c7ce6bef-d5b0-4863-b199-8ea8c7fb117b"), + []byte("e8fb9197-cb9f-4118-b67f-fbfa65f61843"), + []byte("885136e1-0aa1-4fdb-8847-63d87b07c205"), + []byte("ce7b2019-8ebe-4906-a74d-0afa2409e5df"), + []byte("a9ee2527-821b-4b71-a926-03f73c3fc8b7"), + } + require.Equal(t, expectedValsHead, vals[:len(expectedValsHead)]) +} diff --git a/go/parquet/file/file_writer_test.go b/go/parquet/file/file_writer_test.go index 0faf3f7233bd3..12ac93d1ef4b2 100644 --- a/go/parquet/file/file_writer_test.go +++ b/go/parquet/file/file_writer_test.go @@ -260,7 +260,7 @@ func (t *SerializeTestSuite) TestSmallFile() { compress.Codecs.Brotli, compress.Codecs.Gzip, compress.Codecs.Zstd, - // compress.Codecs.Lz4, + compress.Codecs.Lz4Raw, // compress.Codecs.Lzo, } for _, c := range codecs { @@ -540,3 +540,59 @@ func TestBatchedByteStreamSplitFileRoundtrip(t *testing.T) { require.NoError(t, rdr.Close()) } + +func TestLZ4RawFileRoundtrip(t *testing.T) { + input := []int64{ + -1, 0, 1, 2, 3, 4, 5, 123456789, -123456789, + } + + size := len(input) + + field, err := schema.NewPrimitiveNodeLogical("int64", parquet.Repetitions.Required, nil, parquet.Types.Int64, 0, 1) + require.NoError(t, err) + + schema, err := schema.NewGroupNode("test", parquet.Repetitions.Required, schema.FieldList{field}, 0) + require.NoError(t, err) + + sink := encoding.NewBufferWriter(0, memory.DefaultAllocator) + writer := file.NewParquetWriter(sink, schema, file.WithWriterProps(parquet.NewWriterProperties(parquet.WithCompression(compress.Codecs.Lz4Raw)))) + + rgw := writer.AppendRowGroup() + cw, err := rgw.NextColumn() + require.NoError(t, err) + + i64ColumnWriter, ok := cw.(*file.Int64ColumnChunkWriter) + require.True(t, ok) + + nVals, err := i64ColumnWriter.WriteBatch(input, nil, nil) + require.NoError(t, err) + require.EqualValues(t, size, nVals) + + require.NoError(t, cw.Close()) + require.NoError(t, rgw.Close()) + require.NoError(t, writer.Close()) + + rdr, err := file.NewParquetReader(bytes.NewReader(sink.Bytes())) + require.NoError(t, err) + + require.Equal(t, 1, rdr.NumRowGroups()) + require.EqualValues(t, size, rdr.NumRows()) + + rgr := rdr.RowGroup(0) + cr, err := rgr.Column(0) + require.NoError(t, err) + + i64ColumnReader, ok := 
cr.(*file.Int64ColumnChunkReader) + require.True(t, ok) + + output := make([]int64, size) + + total, valuesRead, err := i64ColumnReader.ReadBatch(int64(size), output, nil, nil) + require.NoError(t, err) + require.EqualValues(t, size, total) + require.EqualValues(t, size, valuesRead) + + require.Equal(t, input, output) + + require.NoError(t, rdr.Close()) +} diff --git a/go/parquet/pqarrow/reader_writer_test.go b/go/parquet/pqarrow/reader_writer_test.go index 31bd0eba84388..e020c7d9457a9 100644 --- a/go/parquet/pqarrow/reader_writer_test.go +++ b/go/parquet/pqarrow/reader_writer_test.go @@ -19,6 +19,8 @@ package pqarrow_test import ( "bytes" "context" + "fmt" + "math" "testing" "unsafe" @@ -26,8 +28,10 @@ import ( "github.com/apache/arrow/go/v18/arrow/array" "github.com/apache/arrow/go/v18/arrow/memory" "github.com/apache/arrow/go/v18/parquet" + "github.com/apache/arrow/go/v18/parquet/compress" "github.com/apache/arrow/go/v18/parquet/file" "github.com/apache/arrow/go/v18/parquet/pqarrow" + "github.com/stretchr/testify/require" "golang.org/x/exp/rand" "gonum.org/v1/gonum/stat/distuv" ) @@ -275,3 +279,110 @@ func BenchmarkReadColumnFloat64(b *testing.B) { benchReadTable(b, tt.name, tbl, int64(arrow.Int32Traits.BytesRequired(SIZELEN))) } } + +var compressTestCases = []struct { + c compress.Compression +}{ + {compress.Codecs.Uncompressed}, + {compress.Codecs.Snappy}, + {compress.Codecs.Gzip}, + {compress.Codecs.Brotli}, + {compress.Codecs.Zstd}, + {compress.Codecs.Lz4Raw}, + // {compress.Codecs.Lzo}, +} + +func buildTableForTest(mem memory.Allocator) arrow.Table { + schema := arrow.NewSchema( + []arrow.Field{ + {Name: "int64s", Type: arrow.PrimitiveTypes.Int64}, + {Name: "strings", Type: arrow.BinaryTypes.String}, + {Name: "bools", Type: arrow.FixedWidthTypes.Boolean}, + {Name: "repeated_int64s", Type: arrow.PrimitiveTypes.Int64}, + {Name: "repeated_strings", Type: arrow.BinaryTypes.String}, + {Name: "repeated_bools", Type: arrow.FixedWidthTypes.Boolean}, + }, + nil, + ) + bldr := array.NewRecordBuilder(mem, schema) + defer bldr.Release() + + for i := 0; i < SIZELEN; i++ { + bldr.Field(0).(*array.Int64Builder).Append(int64(i)) + bldr.Field(1).(*array.StringBuilder).Append(fmt.Sprint(i)) + bldr.Field(2).(*array.BooleanBuilder).Append(i%2 == 0) + bldr.Field(3).(*array.Int64Builder).Append(0) + bldr.Field(4).(*array.StringBuilder).Append("the string is the same") + bldr.Field(5).(*array.BooleanBuilder).Append(true) + } + + rec := bldr.NewRecord() + return array.NewTableFromRecords(schema, []arrow.Record{rec}) +} + +func BenchmarkWriteTableCompressed(b *testing.B) { + mem := memory.DefaultAllocator + table := buildTableForTest(mem) + defer table.Release() + + var uncompressedSize uint64 + for idxCol := 0; int64(idxCol) < table.NumCols(); idxCol++ { + column := table.Column(idxCol) + for _, chunk := range column.Data().Chunks() { + uncompressedSize += chunk.Data().SizeInBytes() + } + } + + var buf bytes.Buffer + buf.Grow(int(uncompressedSize)) + for _, tc := range compressTestCases { + b.Run(fmt.Sprintf("codec=%s", tc.c), func(b *testing.B) { + buf.Reset() + b.ResetTimer() + b.SetBytes(int64(uncompressedSize)) + for n := 0; n < b.N; n++ { + require.NoError(b, + pqarrow.WriteTable( + table, + &buf, + math.MaxInt64, + parquet.NewWriterProperties(parquet.WithAllocator(mem), parquet.WithCompression(tc.c)), + pqarrow.DefaultWriterProps(), + ), + ) + } + }) + } +} + +func BenchmarkReadTableCompressed(b *testing.B) { + ctx := context.Background() + mem := memory.DefaultAllocator + table := 
buildTableForTest(mem) + defer table.Release() + + for _, tc := range compressTestCases { + b.Run(fmt.Sprintf("codec=%s", tc.c), func(b *testing.B) { + var buf bytes.Buffer + err := pqarrow.WriteTable( + table, + &buf, + math.MaxInt64, + parquet.NewWriterProperties(parquet.WithAllocator(mem), parquet.WithCompression(tc.c)), + pqarrow.DefaultWriterProps(), + ) + require.NoError(b, err) + + compressedBytes := buf.Len() + rdr := bytes.NewReader(buf.Bytes()) + + b.ResetTimer() + b.SetBytes(int64(compressedBytes)) + for n := 0; n < b.N; n++ { + tab, err := pqarrow.ReadTable(ctx, rdr, nil, pqarrow.ArrowReadProperties{}, mem) + require.NoError(b, err) + defer tab.Release() + } + }) + } +} From 581a6db015ea40fe32b025fe1b97e2ff817c36a8 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 27 Aug 2024 19:17:55 +0200 Subject: [PATCH 027/130] MINOR: [CI] Use `docker compose` on self-hosted ARM builds (#43844) ### Rationale for this change The Docker client version on the ARM64 self-hosted runners is now recent enough, so we don't need to use `docker-compose` there anymore. Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- .github/workflows/cpp.yml | 5 +---- .github/workflows/go.yml | 5 ----- dev/tasks/java-jars/github.yml | 2 -- dev/tasks/linux-packages/github.linux.yml | 1 - dev/tasks/python-wheels/github.linux.yml | 1 - 5 files changed, 1 insertion(+), 13 deletions(-) diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index a82e1eb76660b..c5482f730823b 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -99,7 +99,6 @@ jobs: cat <> "$GITHUB_OUTPUT" { "arch": "arm64v8", - "archery-use-legacy-docker-compose": "1", "clang-tools": "10", "image": "ubuntu-cpp", "llvm": "10", @@ -124,9 +123,6 @@ jobs: include: ${{ fromJson(needs.docker-targets.outputs.targets) }} env: ARCH: ${{ matrix.arch }} - # By default, use `docker compose` because docker-compose v1 is obsolete, - # except where the Docker client version is too old. - ARCHERY_USE_LEGACY_DOCKER_COMPOSE: ${{ matrix.archery-use-legacy-docker-compose || '0' }} ARROW_SIMD_LEVEL: ${{ matrix.simd-level }} CLANG_TOOLS: ${{ matrix.clang-tools }} LLVM: ${{ matrix.llvm }} @@ -147,6 +143,7 @@ jobs: run: | sudo apt update sudo apt install -y --no-install-recommends python3 python3-dev python3-pip + python3 -m pip install -U pip - name: Setup Archery run: python3 -m pip install -e dev/archery[docker] - name: Execute Docker Build diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index 20c78d86cb2a3..ffd543691d5b2 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -78,14 +78,12 @@ jobs: { "arch-label": "ARM64", "arch": "arm64v8", - "archery-use-legacy-docker-compose": "1", "go": "1.21", "runs-on": ["self-hosted", "arm", "linux"] }, { "arch-label": "ARM64", "arch": "arm64v8", - "archery-use-legacy-docker-compose": "1", "go": "1.22", "runs-on": ["self-hosted", "arm", "linux"] } @@ -106,9 +104,6 @@ jobs: include: ${{ fromJson(needs.docker-targets.outputs.targets) }} env: ARCH: ${{ matrix.arch }} - # By default, use Docker CLI because docker-compose v1 is obsolete, - # except where the Docker client version is too old. 
- ARCHERY_USE_LEGACY_DOCKER_COMPOSE: ${{ matrix.archery-use-legacy-docker-compose || '0' }} GO: ${{ matrix.go }} steps: - name: Checkout Arrow diff --git a/dev/tasks/java-jars/github.yml b/dev/tasks/java-jars/github.yml index 7cbd5f05dab4a..bdbed1bd678e6 100644 --- a/dev/tasks/java-jars/github.yml +++ b/dev/tasks/java-jars/github.yml @@ -30,7 +30,6 @@ jobs: ARCH: {{ '${{ matrix.platform.archery_arch }}' }} ARCH_ALIAS: {{ '${{ matrix.platform.archery_arch_alias }}' }} ARCH_SHORT: {{ '${{ matrix.platform.archery_arch_short }}' }} - ARCHERY_USE_LEGACY_DOCKER_COMPOSE: {{ "${{matrix.platform.archery_use_legacy_docker_compose || '0'}}" }} strategy: fail-fast: false matrix: @@ -45,7 +44,6 @@ jobs: archery_arch: "arm64v8" archery_arch_alias: "aarch64" archery_arch_short: "arm64" - archery_use_legacy_docker_compose: "1" steps: {{ macros.github_checkout_arrow()|indent }} {{ macros.github_free_space()|indent }} diff --git a/dev/tasks/linux-packages/github.linux.yml b/dev/tasks/linux-packages/github.linux.yml index 4bf2295ef3e95..cce976cd60e4e 100644 --- a/dev/tasks/linux-packages/github.linux.yml +++ b/dev/tasks/linux-packages/github.linux.yml @@ -29,7 +29,6 @@ jobs: {% endif %} env: ARCHITECTURE: {{ architecture }} - ARCHERY_USE_LEGACY_DOCKER_COMPOSE: {{ '1' if architecture == 'arm64' else '0' }} steps: {{ macros.github_checkout_arrow()|indent }} {{ macros.github_login_dockerhub()|indent }} diff --git a/dev/tasks/python-wheels/github.linux.yml b/dev/tasks/python-wheels/github.linux.yml index 2854d4349fb7c..97746ba3f9b8b 100644 --- a/dev/tasks/python-wheels/github.linux.yml +++ b/dev/tasks/python-wheels/github.linux.yml @@ -33,7 +33,6 @@ jobs: ARCH: amd64 {% else %} ARCH: arm64v8 - ARCHERY_USE_LEGACY_DOCKER_COMPOSE: 1 {% endif %} PYTHON: "{{ python_version }}" {% if python_version == "3.13" %} From 5dcd5ebb920b755fe7b62fdb06484297c1c3bd5c Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Wed, 28 Aug 2024 05:47:43 +0900 Subject: [PATCH 028/130] GH-43805: [C++] Enable filesystem automatically when one of ARROW_{AZURE,GCS,HDFS,S3}=ON is specified (#43806) ### Rationale for this change `ARROW_{AZURE,GCS,HDFS,S3}=ON` are meaningful only when filesystem is enabled. If the user specified one of them, we can assume that the user wants to enable filesystem. ### What changes are included in this PR? Enable `ARROW_FILESYSTEM` when one of `ARROW_{AZURE,GCS,HDFS,S3}=ON` are specified. ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. `ARROW_FILESYSTEM` is enabled automatically with one of `ARROW_{AZURE,GCS,HDFS,S3}=ON`. 
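For illustration (not part of the patch): with this change, a configure step along the following lines should be enough to get the S3 filesystem built. Only the source and build paths below are assumptions; the option names come from `DefineOptions.cmake`:

    cmake -S cpp -B cpp/build -DARROW_S3=ON   # ARROW_FILESYSTEM is now enabled implicitly
    cmake --build cpp/build

Before this change, `-DARROW_FILESYSTEM=ON` had to be passed explicitly alongside any of the `ARROW_{AZURE,GCS,HDFS,S3}` options.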
* GitHub Issue: #43805 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- cpp/cmake_modules/DefineOptions.cmake | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake index 41466a1c22404..755887314d110 100644 --- a/cpp/cmake_modules/DefineOptions.cmake +++ b/cpp/cmake_modules/DefineOptions.cmake @@ -303,7 +303,10 @@ takes precedence over ccache if a storage backend is configured" ON) ARROW_IPC) define_option(ARROW_AZURE - "Build Arrow with Azure support (requires the Azure SDK for C++)" OFF) + "Build Arrow with Azure support (requires the Azure SDK for C++)" + OFF + DEPENDS + ARROW_FILESYSTEM) define_option(ARROW_BUILD_UTILITIES "Build Arrow commandline utilities" OFF) @@ -346,9 +349,16 @@ takes precedence over ccache if a storage backend is configured" ON) ARROW_WITH_UTF8PROC) define_option(ARROW_GCS - "Build Arrow with GCS support (requires the GCloud SDK for C++)" OFF) + "Build Arrow with GCS support (requires the GCloud SDK for C++)" + OFF + DEPENDS + ARROW_FILESYSTEM) - define_option(ARROW_HDFS "Build the Arrow HDFS bridge" OFF) + define_option(ARROW_HDFS + "Build the Arrow HDFS bridge" + OFF + DEPENDS + ARROW_FILESYSTEM) define_option(ARROW_IPC "Build the Arrow IPC extensions" ON) @@ -398,7 +408,11 @@ takes precedence over ccache if a storage backend is configured" ON) ARROW_HDFS ARROW_JSON) - define_option(ARROW_S3 "Build Arrow with S3 support (requires the AWS SDK for C++)" OFF) + define_option(ARROW_S3 + "Build Arrow with S3 support (requires the AWS SDK for C++)" + OFF + DEPENDS + ARROW_FILESYSTEM) define_option(ARROW_SKYHOOK "Build the Skyhook libraries" From fd3df37082d7e7dffa82aba79c52d516f5206633 Mon Sep 17 00:00:00 2001 From: Vibhatha Lakmal Abeykoon Date: Wed, 28 Aug 2024 06:13:31 +0530 Subject: [PATCH 029/130] MINOR: [Java] Logback dependency upgrade (#43842) ### Rationale for this change Fusing https://github.com/apache/arrow/pull/43752 and https://github.com/apache/arrow/pull/43827 dependabot PRs into a single PR. ### What changes are included in this PR? Keeping a single version for both `logback-classic` and `logback-core`. ### Are these changes tested? N/A ### Are there any user-facing changes? No Authored-by: Vibhatha Lakmal Abeykoon Signed-off-by: David Li --- java/memory/memory-netty/pom.xml | 1 - java/pom.xml | 13 ++++++++++++- java/tools/pom.xml | 1 - 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/java/memory/memory-netty/pom.xml b/java/memory/memory-netty/pom.xml index f2d4d2d0fe3bc..6cf573dd4d381 100644 --- a/java/memory/memory-netty/pom.xml +++ b/java/memory/memory-netty/pom.xml @@ -56,7 +56,6 @@ under the License. ch.qos.logback logback-core - 1.3.14 test diff --git a/java/pom.xml b/java/pom.xml index f78d02c0c650f..577f23e6a719c 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -111,6 +111,7 @@ under the License. 5.11.0 5.2.0 3.46.0 + 1.5.7 none -Xdoclint:none @@ -221,6 +222,16 @@ under the License. pom import + + ch.qos.logback + logback-classic + ${logback.version} + + + ch.qos.logback + logback-core + ${logback.version} + @@ -274,7 +285,7 @@ under the License. ch.qos.logback logback-classic - 1.4.14 + ${logback.version} test diff --git a/java/tools/pom.xml b/java/tools/pom.xml index 94566495dff19..082f06860c61b 100644 --- a/java/tools/pom.xml +++ b/java/tools/pom.xml @@ -59,7 +59,6 @@ under the License. 
ch.qos.logback logback-classic - 1.4.14 test From 8da5134c84fb001afe6d702c556883c51320860d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 10 Sep 2024 14:12:58 -0400 Subject: [PATCH 105/130] MINOR: [Java] Bump io.netty:netty-bom from 4.1.112.Final to 4.1.113.Final in /java (#44022) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [io.netty:netty-bom](https://github.com/netty/netty) from 4.1.112.Final to 4.1.113.Final.
Commits
  • d0a109e [maven-release-plugin] prepare release netty-4.1.113.Final
  • e1d6384 Cleanup fields on AdaptiveByteBuf::deallocate (#14273)
  • 8a02f45 Upload hidden files for staging (#14275)
  • c0fdb8e adjust continuation frame header length (#14245)
  • 95d86bb chore: clean code DefaultChannelPipeline add method (#14249)
  • 1c1da9f Fix netty-all artifact snapshot deployments (#14264)
  • 235eb6f Upgrade to netty-tcnative 2.0.66.Final (#14254)
  • ceade95 Ensure flushes are not discarded by ChunkedWriteHandler for passed th… (#14248)
  • dc30c33 Add new SslHandler.isEncrypted(...) variant that will not produce fal… (#14243)
  • 31d1592 Remove reference to parent in recycled buffers for leak detection (#14250)
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=io.netty:netty-bom&package-manager=maven&previous-version=4.1.112.Final&new-version=4.1.113.Final)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@ dependabot rebase` will rebase this PR - `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@ dependabot merge` will merge this PR after your CI passes on it - `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@ dependabot cancel merge` will cancel a previously requested merge and block automerging - `@ dependabot reopen` will reopen this PR if it is closed - `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
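Since this PR only moves the `io.netty:netty-bom` version property in `java/pom.xml`, every managed `netty-*` artifact picks up 4.1.113.Final from the one-line diff below. As an illustrative sanity check (not part of this patch, and not Arrow code), Netty's own `io.netty.util.Version.identify()` can confirm which artifact versions actually end up on the classpath after the bump:

```java
import java.util.Map;

import io.netty.util.Version;

// Illustrative only: lists the Netty artifacts visible to the current class
// loader so a BOM bump (4.1.112.Final -> 4.1.113.Final here) can be verified
// after dependencies are refreshed.
public final class PrintNettyVersions {
  public static void main(String[] args) {
    Map<String, Version> versions = Version.identify();
    versions.forEach((artifact, version) ->
        System.out.println(artifact + " => " + version));
  }
}
```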
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Dane Pitkin --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/pom.xml b/java/pom.xml index 1c68fde535879..1e22b6b973b9f 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -96,7 +96,7 @@ under the License. 5.11.0 2.0.16 33.2.1-jre - 4.1.112.Final + 4.1.113.Final 1.65.0 3.25.4 2.17.2 From 9a36873978441618d889ca35a5c5ca596115aa33 Mon Sep 17 00:00:00 2001 From: larry98 Date: Tue, 10 Sep 2024 15:08:00 -0400 Subject: [PATCH 106/130] GH-43187: [C++] Support basic is_in predicate simplification (#43761) ### Rationale for this change Prior to https://github.com/apache/arrow/pull/43256, this PR adds a basic implementation that does a linear scan filter over the value set on each guarantee. This isolates the correctness/semantics of `is_in` predicate simplification from the binary search performance optimization. ### What changes are included in this PR? `SimplifyWithGuarantee` now handles `is_in` expressions. ### Are these changes tested? A new unit test was added to arrow-compute-expression-test testing this change. ### Are there any user-facing changes? No. * GitHub Issue: #43187 Lead-authored-by: Larry Wang Co-authored-by: larry98 Co-authored-by: Benjamin Kietzman Signed-off-by: Benjamin Kietzman --- cpp/src/arrow/compute/expression.cc | 73 ++++++++++ cpp/src/arrow/compute/expression_test.cc | 173 +++++++++++++++++++++++ 2 files changed, 246 insertions(+) diff --git a/cpp/src/arrow/compute/expression.cc b/cpp/src/arrow/compute/expression.cc index 33e5928c2865d..12fda5d58f3bf 100644 --- a/cpp/src/arrow/compute/expression.cc +++ b/cpp/src/arrow/compute/expression.cc @@ -23,6 +23,7 @@ #include #include "arrow/chunked_array.h" +#include "arrow/compute/api_aggregate.h" #include "arrow/compute/api_vector.h" #include "arrow/compute/exec_internal.h" #include "arrow/compute/expression_internal.h" @@ -1242,6 +1243,72 @@ struct Inequality { /*insert_implicit_casts=*/false, &exec_context); } + /// Simplify an `is_in` call against an inequality guarantee. + /// + /// We avoid the complexity of fully simplifying EQUAL comparisons to true + /// literals (e.g., 'x is_in [1, 2, 3]' given the guarantee 'x = 2') due to + /// potential complications with null matching behavior. This is ok for the + /// predicate pushdown use case because the overall aim is to simplify to an + /// unsatisfiable expression. + /// + /// \pre `is_in_call` is a call to the `is_in` function + /// \return a simplified expression, or nullopt if no simplification occurred + static Result> SimplifyIsIn( + const Inequality& guarantee, const Expression::Call* is_in_call) { + DCHECK_EQ(is_in_call->function_name, "is_in"); + + auto options = checked_pointer_cast(is_in_call->options); + + const auto& lhs = Comparison::StripOrderPreservingCasts(is_in_call->arguments[0]); + if (!lhs.field_ref()) return std::nullopt; + if (*lhs.field_ref() != guarantee.target) return std::nullopt; + + FilterOptions::NullSelectionBehavior null_selection; + switch (options->null_matching_behavior) { + case SetLookupOptions::MATCH: + null_selection = + guarantee.nullable ? 
FilterOptions::EMIT_NULL : FilterOptions::DROP; + break; + case SetLookupOptions::SKIP: + null_selection = FilterOptions::DROP; + break; + case SetLookupOptions::EMIT_NULL: + if (guarantee.nullable) return std::nullopt; + null_selection = FilterOptions::DROP; + break; + case SetLookupOptions::INCONCLUSIVE: + if (guarantee.nullable) return std::nullopt; + ARROW_ASSIGN_OR_RAISE(Datum is_null, IsNull(options->value_set)); + ARROW_ASSIGN_OR_RAISE(Datum any_null, Any(is_null)); + if (any_null.scalar_as().value) return std::nullopt; + null_selection = FilterOptions::DROP; + break; + } + + std::string func_name = Comparison::GetName(guarantee.cmp); + DCHECK_NE(func_name, "na"); + std::vector args{options->value_set, guarantee.bound}; + ARROW_ASSIGN_OR_RAISE(Datum filter_mask, CallFunction(func_name, args)); + FilterOptions filter_options(null_selection); + ARROW_ASSIGN_OR_RAISE(Datum simplified_value_set, + Filter(options->value_set, filter_mask, filter_options)); + + if (simplified_value_set.length() == 0) return literal(false); + if (simplified_value_set.length() == options->value_set.length()) return std::nullopt; + + ExecContext exec_context; + Expression::Call simplified_call; + simplified_call.function_name = "is_in"; + simplified_call.arguments = is_in_call->arguments; + simplified_call.options = std::make_shared( + simplified_value_set, options->null_matching_behavior); + ARROW_ASSIGN_OR_RAISE( + Expression simplified_expr, + BindNonRecursive(std::move(simplified_call), + /*insert_implicit_casts=*/false, &exec_context)); + return simplified_expr; + } + /// \brief Simplify the given expression given this inequality as a guarantee. Result Simplify(Expression expr) { const auto& guarantee = *this; @@ -1258,6 +1325,12 @@ struct Inequality { return call->function_name == "is_valid" ? 
literal(true) : literal(false); } + if (call->function_name == "is_in") { + ARROW_ASSIGN_OR_RAISE(std::optional result, + SimplifyIsIn(guarantee, call)); + return result.value_or(expr); + } + auto cmp = Comparison::Get(expr); if (!cmp) return expr; diff --git a/cpp/src/arrow/compute/expression_test.cc b/cpp/src/arrow/compute/expression_test.cc index d94a17b6ffadf..0b7e8a9c23b13 100644 --- a/cpp/src/arrow/compute/expression_test.cc +++ b/cpp/src/arrow/compute/expression_test.cc @@ -27,6 +27,7 @@ #include #include +#include "arrow/array/builder_primitive.h" #include "arrow/compute/expression_internal.h" #include "arrow/compute/function_internal.h" #include "arrow/compute/registry.h" @@ -1616,6 +1617,144 @@ TEST(Expression, SimplifyWithComparisonAndNullableCaveat) { true_unless_null(field_ref("i32")))); // not satisfiable, will drop row group } +TEST(Expression, SimplifyIsIn) { + auto is_in = [](Expression field, std::shared_ptr value_set_type, + std::string json_array, + SetLookupOptions::NullMatchingBehavior null_matching_behavior) { + SetLookupOptions options{ArrayFromJSON(value_set_type, json_array), + null_matching_behavior}; + return call("is_in", {field}, options); + }; + + for (SetLookupOptions::NullMatchingBehavior null_matching : { + SetLookupOptions::MATCH, + SetLookupOptions::SKIP, + SetLookupOptions::EMIT_NULL, + SetLookupOptions::INCONCLUSIVE, + }) { + Simplify{is_in(field_ref("i32"), int32(), "[]", null_matching)} + .WithGuarantee(greater(field_ref("i32"), literal(2))) + .Expect(false); + + Simplify{is_in(field_ref("i32"), int32(), "[1,3,5,7,9]", null_matching)} + .WithGuarantee(equal(field_ref("i32"), literal(6))) + .Expect(false); + + Simplify{is_in(field_ref("i32"), int32(), "[1,3,5,7,9]", null_matching)} + .WithGuarantee(greater(field_ref("i32"), literal(3))) + .Expect(is_in(field_ref("i32"), int32(), "[5,7,9]", null_matching)); + + Simplify{is_in(field_ref("i32"), int32(), "[1,3,5,7,9]", null_matching)} + .WithGuarantee(greater(field_ref("i32"), literal(9))) + .Expect(false); + + Simplify{is_in(field_ref("i32"), int32(), "[1,3,5,7,9]", null_matching)} + .WithGuarantee(less_equal(field_ref("i32"), literal(0))) + .Expect(false); + + Simplify{is_in(field_ref("i32"), int32(), "[1,3,5,7,9]", null_matching)} + .WithGuarantee(greater(field_ref("i32"), literal(0))) + .ExpectUnchanged(); + + Simplify{is_in(field_ref("i32"), int32(), "[1,3,5,7,9]", null_matching)} + .WithGuarantee(less_equal(field_ref("i32"), literal(9))) + .ExpectUnchanged(); + + Simplify{is_in(field_ref("i32"), int32(), "[1,3,5,7,9]", null_matching)} + .WithGuarantee(and_(less_equal(field_ref("i32"), literal(7)), + greater(field_ref("i32"), literal(4)))) + .Expect(is_in(field_ref("i32"), int32(), "[5,7]", null_matching)); + + Simplify{is_in(field_ref("u32"), int8(), "[1,3,5,7,9]", null_matching)} + .WithGuarantee(greater(field_ref("u32"), literal(3))) + .Expect(is_in(field_ref("u32"), int8(), "[5,7,9]", null_matching)); + + Simplify{is_in(field_ref("u32"), int64(), "[1,3,5,7,9]", null_matching)} + .WithGuarantee(greater(field_ref("u32"), literal(3))) + .Expect(is_in(field_ref("u32"), int64(), "[5,7,9]", null_matching)); + } + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3]", SetLookupOptions::MATCH), + } + .WithGuarantee( + or_(greater(field_ref("i32"), literal(2)), is_null(field_ref("i32")))) + .Expect(is_in(field_ref("i32"), int32(), "[3]", SetLookupOptions::MATCH)); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3,null]", SetLookupOptions::MATCH), + } + .WithGuarantee(greater(field_ref("i32"), 
literal(2))) + .Expect(is_in(field_ref("i32"), int32(), "[3]", SetLookupOptions::MATCH)); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3,null]", SetLookupOptions::MATCH), + } + .WithGuarantee( + or_(greater(field_ref("i32"), literal(2)), is_null(field_ref("i32")))) + .Expect(is_in(field_ref("i32"), int32(), "[3,null]", SetLookupOptions::MATCH)); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3]", SetLookupOptions::SKIP), + } + .WithGuarantee( + or_(greater(field_ref("i32"), literal(2)), is_null(field_ref("i32")))) + .Expect(is_in(field_ref("i32"), int32(), "[3]", SetLookupOptions::SKIP)); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3,null]", SetLookupOptions::SKIP), + } + .WithGuarantee(greater(field_ref("i32"), literal(2))) + .Expect(is_in(field_ref("i32"), int32(), "[3]", SetLookupOptions::SKIP)); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3,null]", SetLookupOptions::SKIP), + } + .WithGuarantee( + or_(greater(field_ref("i32"), literal(2)), is_null(field_ref("i32")))) + .Expect(is_in(field_ref("i32"), int32(), "[3]", SetLookupOptions::SKIP)); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3]", SetLookupOptions::EMIT_NULL), + } + .WithGuarantee( + or_(greater(field_ref("i32"), literal(2)), is_null(field_ref("i32")))) + .ExpectUnchanged(); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3,null]", SetLookupOptions::EMIT_NULL), + } + .WithGuarantee(greater(field_ref("i32"), literal(2))) + .Expect(is_in(field_ref("i32"), int32(), "[3]", SetLookupOptions::EMIT_NULL)); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3,null]", SetLookupOptions::EMIT_NULL), + } + .WithGuarantee( + or_(greater(field_ref("i32"), literal(2)), is_null(field_ref("i32")))) + .ExpectUnchanged(); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3]", SetLookupOptions::INCONCLUSIVE), + } + .WithGuarantee( + or_(greater(field_ref("i32"), literal(2)), is_null(field_ref("i32")))) + .ExpectUnchanged(); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3,null]", SetLookupOptions::INCONCLUSIVE), + } + .WithGuarantee(greater(field_ref("i32"), literal(2))) + .ExpectUnchanged(); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3,null]", SetLookupOptions::INCONCLUSIVE), + } + .WithGuarantee( + or_(greater(field_ref("i32"), literal(2)), is_null(field_ref("i32")))) + .ExpectUnchanged(); +} + TEST(Expression, SimplifyThenExecute) { auto filter = or_({equal(field_ref("f32"), literal(0)), @@ -1643,6 +1782,40 @@ TEST(Expression, SimplifyThenExecute) { AssertDatumsEqual(evaluated, simplified_evaluated, /*verbose=*/true); } +TEST(Expression, SimplifyIsInThenExecute) { + auto input = RecordBatchFromJSON(kBoringSchema, R"([ + {"i64": 2, "i32": 5}, + {"i64": 5, "i32": 6}, + {"i64": 3, "i32": 6}, + {"i64": 3, "i32": 5}, + {"i64": 4, "i32": 5}, + {"i64": 2, "i32": 7}, + {"i64": 5, "i32": 5} + ])"); + + std::vector guarantees{greater(field_ref("i64"), literal(1)), + greater_equal(field_ref("i32"), literal(5)), + less_equal(field_ref("i64"), literal(5))}; + + for (const Expression& guarantee : guarantees) { + auto filter = + call("is_in", {guarantee.call()->arguments[0]}, + compute::SetLookupOptions{ArrayFromJSON(int32(), "[1,2,3]"), true}); + ASSERT_OK_AND_ASSIGN(filter, filter.Bind(*kBoringSchema)); + ASSERT_OK_AND_ASSIGN(auto simplified, SimplifyWithGuarantee(filter, guarantee)); + + Datum evaluated, simplified_evaluated; + ExpectExecute(filter, input, &evaluated); + ExpectExecute(simplified, input, &simplified_evaluated); + if (simplified_evaluated.is_scalar()) { + 
ASSERT_OK_AND_ASSIGN( + simplified_evaluated, + MakeArrayFromScalar(*simplified_evaluated.scalar(), evaluated.length())); + } + AssertDatumsEqual(evaluated, simplified_evaluated, /*verbose=*/true); + } +} + TEST(Expression, Filter) { auto ExpectFilter = [](Expression filter, std::string batch_json) { ASSERT_OK_AND_ASSIGN(auto s, kBoringSchema->AddField(0, field("in", boolean()))); From b28d202ef535b4312feb098e6e4786553fa46330 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Tue, 10 Sep 2024 15:51:53 -0400 Subject: [PATCH 107/130] GH-43956: [Format] Allow Decimal32/Decimal64 in format (#43976) ### Rationale for this change Widening the Decimal128/256 type to allow for bitwidths of 32 and 64 allows for more interoperability with other libraries and utilities which already support these types. This provides even more opportunities for zero-copy interactions between things such as libcudf and various databases. ### What changes are included in this PR? Updating the documentation in Schema.fbs to explicitly state that 32-bit and 64-bit is now allowed for bitwidths of Decimal types. This is the only area in the the spec that mentions the allowed decimal bitwidths. * GitHub Issue: #43956 --------- Co-authored-by: Antoine Pitrou --- docs/source/format/Columnar.rst | 2 +- docs/source/format/Versioning.rst | 5 +++++ format/Schema.fbs | 9 +++++---- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst index 4c758c5294325..697c39b0cb1d9 100644 --- a/docs/source/format/Columnar.rst +++ b/docs/source/format/Columnar.rst @@ -21,7 +21,7 @@ Arrow Columnar Format ********************* -*Version: 1.4* +*Version: 1.5* .. seealso:: :ref:`Additions to the Arrow columnar format since version 1.0.0 ` diff --git a/docs/source/format/Versioning.rst b/docs/source/format/Versioning.rst index 8fcf11b21f0cc..d46d07a90906c 100644 --- a/docs/source/format/Versioning.rst +++ b/docs/source/format/Versioning.rst @@ -105,3 +105,8 @@ Version 1.4 * Added :ref:`listview-layout` and the associated ListView and LargeListView types. * Added :ref:`variadic-buffers`. + +Version 1.5 +----------- + +* Expanded Decimal type bit widths to allow 32-bit and 64-bit types. diff --git a/format/Schema.fbs b/format/Schema.fbs index a03ca31ae97c4..e8e14b112a771 100644 --- a/format/Schema.fbs +++ b/format/Schema.fbs @@ -24,6 +24,7 @@ /// Version 1.3 - Add Run-End Encoded. /// Version 1.4 - Add BinaryView, Utf8View, variadicBufferCounts, ListView, and /// LargeListView. +/// Version 1.5 - Add 32-bit and 64-bit as allowed bit widths for Decimal namespace org.apache.arrow.flatbuf; @@ -222,9 +223,9 @@ table RunEndEncoded { } /// Exact decimal value represented as an integer value in two's -/// complement. Currently only 128-bit (16-byte) and 256-bit (32-byte) integers -/// are used. The representation uses the endianness indicated -/// in the Schema. +/// complement. Currently 32-bit (4-byte), 64-bit (8-byte), +/// 128-bit (16-byte) and 256-bit (32-byte) integers are used. +/// The representation uses the endianness indicated in the Schema. table Decimal { /// Total number of decimal digits precision: int; @@ -232,7 +233,7 @@ table Decimal { /// Number of digits after the decimal point "." scale: int; - /// Number of bits per value. The only accepted widths are 128 and 256. + /// Number of bits per value. The accepted widths are 32, 64, 128 and 256. /// We use bitWidth for consistency with Int::bitWidth. 
bitWidth: int = 128; } From b1cf8b6539ae63db1f527601af8ba98ed386461c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 10 Sep 2024 16:06:14 -0400 Subject: [PATCH 108/130] MINOR: [Java] Bump com.google.guava:guava-bom from 33.2.1-jre to 33.3.0-jre in /java (#43750) Bumps [com.google.guava:guava-bom](https://github.com/google/guava) from 33.2.1-jre to 33.3.0-jre.
Release notes

Sourced from com.google.guava:guava-bom's releases.

33.3.0

Maven

<dependency>
  <groupId>com.google.guava</groupId>
  <artifactId>guava</artifactId>
  <version>33.3.0-jre</version>
  <!-- or, for Android: -->
  <version>33.3.0-android</version>
</dependency>

Jar files

Guava requires one runtime dependency, which you can download here:

Javadoc

JDiff

Changelog

  • base: Removed @ Beta from the Duration overload of Suppliers.memoizeWithExpiration. (76fca99db95ce9c8e55bb9c37fd0e44ef0451a80)
  • cache: Added CacheBuilder Duration overloads to guava-android. (a5f9bcafd6)
  • collect: Removed @ Beta from the guava-android Collector APIs. (c86c09dc3d)
  • collect: Added ImmutableMultimap.builderWithExpectedKeys and ImmutableMultimap.Builder.expectedValuesPerKey. (c3d5b17dc2)
  • graph: Improved Graphs.hasCycle to avoid causing StackOverflowError for long paths. (63734b9dfc)
  • net: Added text/markdown to MediaType. (2466a099ae)
  • net: Deprecated HttpHeaders constant for Sec-Ch-UA-Form-Factor in favor of Sec-Ch-UA-Form-Factors to follow the latest spec. (b310b7e1ee)
  • testing: Changed some test libraries to throw AssertionError (instead of the more specific AssertionFailedError) in some cases. (fdfbed1985)
Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=com.google.guava:guava-bom&package-manager=maven&previous-version=33.2.1-jre&new-version=33.3.0-jre)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
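One API change called out in the 33.3.0 release notes above is that the `Duration` overload of `Suppliers.memoizeWithExpiration` is no longer `@Beta`. A minimal, self-contained sketch of that overload (illustrative only, not taken from the Arrow code base):

```java
import java.time.Duration;

import com.google.common.base.Supplier;
import com.google.common.base.Suppliers;

// Illustrative only: wraps a computation so it runs at most once every
// 30 seconds, using the Duration overload promoted out of @Beta in 33.3.0.
public final class MemoizeExample {
  public static void main(String[] args) {
    Supplier<Long> cached =
        Suppliers.memoizeWithExpiration(System::nanoTime, Duration.ofSeconds(30));
    System.out.println(cached.get()); // computes the value
    System.out.println(cached.get()); // reuses the memoized value
  }
}
```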
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Dane Pitkin --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/pom.xml b/java/pom.xml index 1e22b6b973b9f..02ec57a5032df 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -95,7 +95,7 @@ under the License. 1.9.0 5.11.0 2.0.16 - 33.2.1-jre + 33.3.0-jre 4.1.113.Final 1.65.0 3.25.4 From 2fc9dc1e0ea184b485ce1157cb000305908d8828 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 10 Sep 2024 16:06:45 -0400 Subject: [PATCH 109/130] MINOR: [Java] Bump checker.framework.version from 3.46.0 to 3.47.0 in /java (#44021) Bumps `checker.framework.version` from 3.46.0 to 3.47.0. Updates `org.checkerframework:checker-qual` from 3.46.0 to 3.47.0
Release notes

Sourced from org.checkerframework:checker-qual's releases.

Checker Framework 3.47.0

Version 3.47.0 (September 3, 2024)

User-visible changes:

The Checker Framework runs under JDK 22 -- that is, it runs on a version 22 JVM. The Checker Framework runs under JDK 23 -- that is, it runs on a version 23 JVM.

The Optional Checker no longer supports the @ OptionalBottom annotation.

Implementation details:

Removed annotations:

  • @ OptionalBottom

Closed issues:

#6510, #6704, #6743, #6749, #6760, #6761.

Changelog

Sourced from org.checkerframework:checker-qual's changelog.

Version 3.47.0 (October 1, 2024)

User-visible changes:

Implementation details:

Closed issues:

Version 3.47.0 (September 3, 2024)

User-visible changes:

The Checker Framework runs under JDK 22 -- that is, it runs on a version 22 JVM. The Checker Framework runs under JDK 23 -- that is, it runs on a version 23 JVM.

The Optional Checker no longer supports the @ OptionalBottom annotation.

Implementation details:

Removed annotations:

  • @ OptionalBottom

Closed issues:

#6510, #6704, #6743, #6749, #6760, #6761.

Commits
  • 2f788fe new release 3.47.0
  • 2d0d20b Prep for release.
  • 0aeb0a4 Removing the @ OptionalBottom type and annotation (#6772)
  • 87f9d44 Support Java 23
  • c16094b Remove resolveDependencies target (#6775)
  • c27f651 Don't use /// comments, whose content must be Markdown in Java 23
  • cb70fb7 Update dependency com.amazonaws:aws-java-sdk-bom to v1.12.770 (#6773)
  • 07940f7 Update versions.errorprone to v2.31.0 (#6771)
  • 7b2378e Support Java 22
  • c5cc9d8 Update dependency io.github.classgraph:classgraph to v4.8.175 (#6766)
  • Additional commits viewable in compare view

Updates `org.checkerframework:checker` from 3.46.0 to 3.47.0
Release notes

Sourced from org.checkerframework:checker's releases.

Checker Framework 3.47.0

Version 3.47.0 (September 3, 2024)

User-visible changes:

The Checker Framework runs under JDK 22 -- that is, it runs on a version 22 JVM. The Checker Framework runs under JDK 23 -- that is, it runs on a version 23 JVM.

The Optional Checker no longer supports the @ OptionalBottom annotation.

Implementation details:

Removed annotations:

  • @ OptionalBottom

Closed issues:

#6510, #6704, #6743, #6749, #6760, #6761.

Changelog

Sourced from org.checkerframework:checker's changelog.

Version 3.47.0 (October 1, 2024)

User-visible changes:

Implementation details:

Closed issues:

Version 3.47.0 (September 3, 2024)

User-visible changes:

The Checker Framework runs under JDK 22 -- that is, it runs on a version 22 JVM. The Checker Framework runs under JDK 23 -- that is, it runs on a version 23 JVM.

The Optional Checker no longer supports the @ OptionalBottom annotation.

Implementation details:

Removed annotations:

  • @ OptionalBottom

Closed issues:

#6510, #6704, #6743, #6749, #6760, #6761.

Commits
  • 2f788fe new release 3.47.0
  • 2d0d20b Prep for release.
  • 0aeb0a4 Removing the @ OptionalBottom type and annotation (#6772)
  • 87f9d44 Support Java 23
  • c16094b Remove resolveDependencies target (#6775)
  • c27f651 Don't use /// comments, whose content must be Markdown in Java 23
  • cb70fb7 Update dependency com.amazonaws:aws-java-sdk-bom to v1.12.770 (#6773)
  • 07940f7 Update versions.errorprone to v2.31.0 (#6771)
  • 7b2378e Support Java 22
  • c5cc9d8 Update dependency io.github.classgraph:classgraph to v4.8.175 (#6766)
  • Additional commits viewable in compare view

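For context on the two artifacts covered by the `checker.framework.version` property: `checker-qual` ships only the annotations that source code is marked up with, while `checker` provides the javac plugin that verifies them at compile time. A small, illustrative sketch of the annotation side (hypothetical class, not taken from Arrow's sources):

```java
import org.checkerframework.checker.nullness.qual.Nullable;

// Illustrative only: @Nullable documents (and, when the checker plugin is
// enabled, enforces) that this field may legitimately hold null.
final class LastError {
  private @Nullable String message;

  void record(String newMessage) {
    this.message = newMessage;
  }

  String describe() {
    return (message == null) ? "no error recorded" : message;
  }
}
```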
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Dane Pitkin --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/pom.xml b/java/pom.xml index 02ec57a5032df..808b0ad4d8cc7 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -110,7 +110,7 @@ under the License. 2.31.0 5.11.0 5.2.0 - 3.46.0 + 3.47.0 1.5.8 none -Xdoclint:none From d658f6484a170fba9c93334982553eecedadfb19 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 11 Sep 2024 02:19:59 +0200 Subject: [PATCH 110/130] MINOR: [CI][C++] Enable core dumps and stack traces in Linux/macOS jobs (#43937) ### Rationale for this change In https://github.com/apache/arrow/pull/43936 I noticed that core dumps were not written out for crashing C++ tests. One problem is that, by default, Ubuntu hosts pipe core dumps to `apport`, but it is not available inside containers. Another is that the `ulimit` must be set in the host, not in the container. In addition, this PR restores automatic traceback generation when running C++ tests, on Linux and macOS jobs. ### Are these changes tested? Manually by introducing a spurious segfault and running Docker containers. ### Are there any user-facing changes? No. Lead-authored-by: Antoine Pitrou Co-authored-by: Antoine Pitrou Co-authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- .github/workflows/cpp.yml | 5 ++- .github/workflows/dev.yml | 3 +- .github/workflows/integration.yml | 1 + .github/workflows/java_jni.yml | 4 ++- .github/workflows/js.yml | 3 +- .github/workflows/python.yml | 3 +- .github/workflows/r.yml | 6 ++-- .github/workflows/ruby.yml | 3 +- .github/workflows/swift.yml | 3 +- ci/docker/fedora-39-cpp.dockerfile | 1 + ci/docker/ubuntu-20.04-cpp-minimal.dockerfile | 1 + ci/docker/ubuntu-22.04-cpp-minimal.dockerfile | 1 + ci/docker/ubuntu-24.04-cpp-minimal.dockerfile | 1 + ci/scripts/util_enable_core_dumps.sh | 33 +++++++++++++++++++ cpp/build-support/run-test.sh | 23 ++++++++----- dev/tasks/docker-tests/github.cuda.yml | 1 + dev/tasks/docker-tests/github.linux.yml | 1 + dev/tasks/python-wheels/github.linux.yml | 1 + dev/tasks/r/github.packages.yml | 3 +- docker-compose.yml | 4 +-- 20 files changed, 70 insertions(+), 31 deletions(-) create mode 100644 ci/scripts/util_enable_core_dumps.sh diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index 4a01d2f8e3aab..f5c8b6a7201be 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -155,8 +155,7 @@ jobs: run: | # GH-40558: reduce ASLR to avoid ASAN/LSAN crashes sudo sysctl -w vm.mmap_rnd_bits=28 - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited + source ci/scripts/util_enable_core_dumps.sh archery docker run ${{ matrix.image }} - name: Docker Push if: >- @@ -272,7 +271,7 @@ jobs: shell: bash run: | sudo sysctl -w kern.coredump=1 - sudo sysctl -w kern.corefile=core.%N.%P + sudo sysctl -w kern.corefile=/tmp/core.%N.%P ulimit -c unlimited # must enable within the same shell ci/scripts/cpp_test.sh $(pwd) $(pwd)/build diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 1cc8d993498b6..3879a045fd239 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -67,8 +67,7 @@ jobs: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: | - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited + source ci/scripts/util_enable_core_dumps.sh archery docker run -e GITHUB_ACTIONS=true ubuntu-lint - name: Docker Push if: >- diff 
--git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index ecf89bff8f600..2d19b1e59b27a 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -101,6 +101,7 @@ jobs: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: > + source ci/scripts/util_enable_core_dumps.sh archery docker run \ -e ARCHERY_DEFAULT_BRANCH=${{ github.event.repository.default_branch }} \ -e ARCHERY_INTEGRATION_WITH_NANOARROW=1 \ diff --git a/.github/workflows/java_jni.yml b/.github/workflows/java_jni.yml index f2ecc801dc724..e730a5bf3e672 100644 --- a/.github/workflows/java_jni.yml +++ b/.github/workflows/java_jni.yml @@ -81,7 +81,9 @@ jobs: env: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} - run: archery docker run java-jni-manylinux-2014 + run: | + source ci/scripts/util_enable_core_dumps.sh + archery docker run java-jni-manylinux-2014 - name: Docker Push if: >- success() && diff --git a/.github/workflows/js.yml b/.github/workflows/js.yml index 17b57c42b62f6..9ab4edf0851cd 100644 --- a/.github/workflows/js.yml +++ b/.github/workflows/js.yml @@ -66,8 +66,7 @@ jobs: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: | - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited + source ci/scripts/util_enable_core_dumps.sh archery docker run debian-js - name: Docker Push if: >- diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 6e83b727593b4..45efd305aa8f6 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -119,8 +119,7 @@ jobs: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: | - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited + source ci/scripts/util_enable_core_dumps.sh archery docker run ${{ matrix.image }} - name: Docker Push if: >- diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index bd1631db4f617..92e0e63fb7ea5 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -158,8 +158,7 @@ jobs: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: | - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited + source ci/scripts/util_enable_core_dumps.sh # Setting a non-default and non-probable Marquesas French Polynesia time # it has both with a .45 offset and very very few people who live there. archery docker run -e TZ=MART -e ARROW_R_FORCE_TESTS=${{ matrix.force-tests }} ubuntu-r @@ -218,8 +217,7 @@ jobs: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: | - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited + source ci/scripts/util_enable_core_dumps.sh # Don't set a TZ here to test that case. These builds will have the following warning in them: # System has not been booted with systemd as init system (PID 1). Can't operate. 
# Failed to connect to bus: Host is down diff --git a/.github/workflows/ruby.yml b/.github/workflows/ruby.yml index c4a7f31f4a94c..05b7b317ffd96 100644 --- a/.github/workflows/ruby.yml +++ b/.github/workflows/ruby.yml @@ -95,8 +95,7 @@ jobs: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: | - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited + source ci/scripts/util_enable_core_dumps.sh archery docker run \ -e ARROW_FLIGHT=ON \ -e ARROW_FLIGHT_SQL=ON \ diff --git a/.github/workflows/swift.yml b/.github/workflows/swift.yml index 86eb113dfc833..87aa5cb83f714 100644 --- a/.github/workflows/swift.yml +++ b/.github/workflows/swift.yml @@ -65,8 +65,7 @@ jobs: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: | - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited + source ci/scripts/util_enable_core_dumps.sh archery docker run ubuntu-swift - name: Docker Push if: >- diff --git a/ci/docker/fedora-39-cpp.dockerfile b/ci/docker/fedora-39-cpp.dockerfile index 33d11823094ce..2ac5afe7b91f6 100644 --- a/ci/docker/fedora-39-cpp.dockerfile +++ b/ci/docker/fedora-39-cpp.dockerfile @@ -34,6 +34,7 @@ RUN dnf update -y && \ curl-devel \ gcc \ gcc-c++ \ + gdb \ gflags-devel \ git \ glog-devel \ diff --git a/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile b/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile index 4d867a448c994..1b342df596c9d 100644 --- a/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile +++ b/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile @@ -29,6 +29,7 @@ RUN apt-get update -y -q && \ ccache \ cmake \ curl \ + gdb \ git \ libssl-dev \ libcurl4-openssl-dev \ diff --git a/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile b/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile index f26cad51f0983..ce31c457e909e 100644 --- a/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile +++ b/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile @@ -29,6 +29,7 @@ RUN apt-get update -y -q && \ ccache \ cmake \ curl \ + gdb \ git \ libssl-dev \ libcurl4-openssl-dev \ diff --git a/ci/docker/ubuntu-24.04-cpp-minimal.dockerfile b/ci/docker/ubuntu-24.04-cpp-minimal.dockerfile index 125bc7ba46a81..a1fd178a2c754 100644 --- a/ci/docker/ubuntu-24.04-cpp-minimal.dockerfile +++ b/ci/docker/ubuntu-24.04-cpp-minimal.dockerfile @@ -29,6 +29,7 @@ RUN apt-get update -y -q && \ ccache \ cmake \ curl \ + gdb \ git \ libssl-dev \ libcurl4-openssl-dev \ diff --git a/ci/scripts/util_enable_core_dumps.sh b/ci/scripts/util_enable_core_dumps.sh new file mode 100644 index 0000000000000..09f8d2d727099 --- /dev/null +++ b/ci/scripts/util_enable_core_dumps.sh @@ -0,0 +1,33 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# NOTE: this script is not marked executable as it should be source'd +# for `ulimit` to take effect. + +set -e + +platform=$(uname) + +if [ "${platform}" = "Linux" ]; then + # We need to override `core_pattern` because + # 1. the original setting may reference apport, which is not available under + # most Docker containers; + # 2. we want to write the core file in a well-known directory. + sudo sysctl -w kernel.core_pattern="/tmp/core.%e.%p" +fi + +ulimit -c unlimited diff --git a/cpp/build-support/run-test.sh b/cpp/build-support/run-test.sh index 8e42438a23c1c..55e3fe0980749 100755 --- a/cpp/build-support/run-test.sh +++ b/cpp/build-support/run-test.sh @@ -121,12 +121,15 @@ function print_coredumps() { # patterns must be set with prefix `core.{test-executable}*`: # # In case of macOS: - # sudo sysctl -w kern.corefile=core.%N.%P + # sudo sysctl -w kern.corefile=/tmp/core.%N.%P # On Linux: - # sudo sysctl -w kernel.core_pattern=core.%e.%p + # sudo sysctl -w kernel.core_pattern=/tmp/core.%e.%p # # and the ulimit must be increased: # ulimit -c unlimited + # + # If the tests are run in a Docker container, the instructions are slightly + # different: see the 'Coredumps' comment section in `docker-compose.yml`. # filename is truncated to the first 15 characters in case of linux, so limit # the pattern for the first 15 characters @@ -134,19 +137,21 @@ function print_coredumps() { FILENAME=$(echo ${FILENAME} | cut -c-15) PATTERN="^core\.${FILENAME}" - COREFILES=$(ls | grep $PATTERN) + COREFILES=$(ls /tmp | grep $PATTERN) if [ -n "$COREFILES" ]; then - echo "Found core dump, printing backtrace:" - for COREFILE in $COREFILES; do + COREPATH="/tmp/${COREFILE}" + echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" + echo "Running '${TEST_EXECUTABLE}' produced core dump at '${COREPATH}', printing backtrace:" # Print backtrace if [ "$(uname)" == "Darwin" ]; then - lldb -c "${COREFILE}" --batch --one-line "thread backtrace all -e true" + lldb -c "${COREPATH}" --batch --one-line "thread backtrace all -e true" else - gdb -c "${COREFILE}" $TEST_EXECUTABLE -ex "thread apply all bt" -ex "set pagination 0" -batch + gdb -c "${COREPATH}" $TEST_EXECUTABLE -ex "thread apply all bt" -ex "set pagination 0" -batch fi - # Remove the coredump, regenerate it via running the test case directly - rm "${COREFILE}" + echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" 
+ # Remove the coredump, it can be regenerated via running the test case directly + rm "${COREPATH}" done fi } diff --git a/dev/tasks/docker-tests/github.cuda.yml b/dev/tasks/docker-tests/github.cuda.yml index 8c04da8a91a4f..d03b3657afc53 100644 --- a/dev/tasks/docker-tests/github.cuda.yml +++ b/dev/tasks/docker-tests/github.cuda.yml @@ -38,6 +38,7 @@ jobs: env: {{ macros.github_set_sccache_envvars()|indent(8) }} run: | + source arrow/ci/scripts/util_enable_core_dumps.sh archery docker run \ -e SETUPTOOLS_SCM_PRETEND_VERSION="{{ arrow.no_rc_version }}" \ {{ flags|default("") }} \ diff --git a/dev/tasks/docker-tests/github.linux.yml b/dev/tasks/docker-tests/github.linux.yml index 28d3203c1ed48..cd2923a50d6df 100644 --- a/dev/tasks/docker-tests/github.linux.yml +++ b/dev/tasks/docker-tests/github.linux.yml @@ -38,6 +38,7 @@ jobs: run: | # GH-40558: reduce ASLR to avoid TSAN crashing sudo sysctl -w vm.mmap_rnd_bits=28 + source arrow/ci/scripts/util_enable_core_dumps.sh archery docker run \ -e SETUPTOOLS_SCM_PRETEND_VERSION="{{ arrow.no_rc_version }}" \ {{ flags|default("") }} \ diff --git a/dev/tasks/python-wheels/github.linux.yml b/dev/tasks/python-wheels/github.linux.yml index f9df27ba3175b..d9dbef82a948e 100644 --- a/dev/tasks/python-wheels/github.linux.yml +++ b/dev/tasks/python-wheels/github.linux.yml @@ -59,6 +59,7 @@ jobs: - name: Test wheel shell: bash run: | + source arrow/ci/scripts/util_enable_core_dumps.sh archery docker run python-wheel-manylinux-test-imports archery docker run python-wheel-manylinux-test-unittests diff --git a/dev/tasks/r/github.packages.yml b/dev/tasks/r/github.packages.yml index 0539eae6cc9d9..db6955b92d1e0 100644 --- a/dev/tasks/r/github.packages.yml +++ b/dev/tasks/r/github.packages.yml @@ -140,8 +140,7 @@ jobs: UBUNTU: {{ '"${{ matrix.ubuntu }}"' }} {{ macros.github_set_sccache_envvars()|indent(8) }} run: | - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited + source ci/scripts/util_enable_core_dumps.sh archery docker run \ -e EXTRA_CMAKE_FLAGS="{{ '${{ matrix.extra-cmake-flags }}' }}" \ {{ '${{ matrix.os }}' }}-cpp-static diff --git a/docker-compose.yml b/docker-compose.yml index 8721eef524a19..6d9b738d8da35 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -38,11 +38,11 @@ # WARNING: setting this will affect the host machine. # # Linux host: -# $ sudo sysctl -w kernel.core_pattern=core.%e.%p +# $ sudo sysctl -w kernel.core_pattern=/tmp/core.%e.%p # # macOS host running Docker for Mac (won't persist between restarts): # $ screen ~/Library/Containers/com.docker.docker/Data/vms/0/tty -# # echo "core.%e.%p" > /proc/sys/kernel/core_pattern +# # echo "/tmp/core.%e.%p" > /proc/sys/kernel/core_pattern # # The setup attempts to generate coredumps by default, but the correct paths # above must be set. In order to disable the coredump generation set From 395ce0752d159fc4bf097429dde488365fd87c74 Mon Sep 17 00:00:00 2001 From: Vibhatha Lakmal Abeykoon Date: Wed, 11 Sep 2024 06:21:41 +0530 Subject: [PATCH 111/130] GH-44044: [Java] Consider warnings as errors for Vector Module (#44045) ### Rationale for this change This PR configs the build such that warnings are considered errors in the Vector module. And corresponding code changes have also been made. ### What changes are included in this PR? Adding flags to consider warnings as errors in javac and fixing the corresponding errors. ### Are these changes tested? Tested by existing test cases. ### Are there any user-facing changes? 
N/A * GitHub Issue: #44044 Authored-by: Vibhatha Lakmal Abeykoon Signed-off-by: David Li --- java/vector/pom.xml | 9 +++++++++ java/vector/src/main/codegen/templates/BaseWriter.java | 1 + 2 files changed, 10 insertions(+) diff --git a/java/vector/pom.xml b/java/vector/pom.xml index 73d76fc7306ae..eb0e39565332e 100644 --- a/java/vector/pom.xml +++ b/java/vector/pom.xml @@ -118,6 +118,15 @@ under the License. + + org.apache.maven.plugins + maven-compiler-plugin + + + -Werror + + + org.apache.drill.tools diff --git a/java/vector/src/main/codegen/templates/BaseWriter.java b/java/vector/src/main/codegen/templates/BaseWriter.java index 458a4df1eec82..e952d46f1f241 100644 --- a/java/vector/src/main/codegen/templates/BaseWriter.java +++ b/java/vector/src/main/codegen/templates/BaseWriter.java @@ -125,6 +125,7 @@ public interface StructOrListWriter { /** * @deprecated use {@link #listOfStruct()} instead. */ + @Deprecated StructOrListWriter listoftstruct(String name); StructOrListWriter listOfStruct(String name); StructOrListWriter list(String name); From 0a4d5c1b2813d0bd5b07ac667ec6b17a00c2e44c Mon Sep 17 00:00:00 2001 From: Vibhatha Lakmal Abeykoon Date: Wed, 11 Sep 2024 06:49:33 +0530 Subject: [PATCH 112/130] GH-43962: [Java] Consider warnings as errors for Adapter Module (#43963) ### Rationale for this change This PR configs the build such that warnings are considered as errors in the Adapter module. And corresponding code changes have also been made. ### What changes are included in this PR? Adding flags to consider warnings as errors in javac and fixing the corresponding errors. ### Are these changes tested? Tested by existing test cases. ### Are there any user-facing changes? N/A * GitHub Issue: #43962 Authored-by: Vibhatha Lakmal Abeykoon Signed-off-by: David Li --- java/adapter/avro/pom.xml | 14 ++++++++++++++ .../adapter/avro/AvroToArrowIteratorTest.java | 1 + java/adapter/jdbc/pom.xml | 9 +++++++++ .../adapter/jdbc/h2/JdbcToArrowCharSetTest.java | 1 + .../adapter/jdbc/h2/JdbcToArrowDataTypesTest.java | 1 + .../jdbc/h2/JdbcToArrowMapDataTypeTest.java | 1 + .../arrow/adapter/jdbc/h2/JdbcToArrowNullTest.java | 1 + .../jdbc/h2/JdbcToArrowOptionalColumnsTest.java | 1 + .../arrow/adapter/jdbc/h2/JdbcToArrowTest.java | 1 + .../adapter/jdbc/h2/JdbcToArrowTimeZoneTest.java | 1 + .../jdbc/h2/JdbcToArrowVectorIteratorTest.java | 1 + java/adapter/orc/pom.xml | 9 +++++++++ 12 files changed, 41 insertions(+) diff --git a/java/adapter/avro/pom.xml b/java/adapter/avro/pom.xml index cb4adccb76771..2c02e72e9c838 100644 --- a/java/adapter/avro/pom.xml +++ b/java/adapter/avro/pom.xml @@ -56,4 +56,18 @@ under the License. ${dep.avro.version}
+ + + + + org.apache.maven.plugins + maven-compiler-plugin + + + -Werror + + + + + diff --git a/java/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/AvroToArrowIteratorTest.java b/java/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/AvroToArrowIteratorTest.java index f8022a9385134..44ccbc74511dd 100644 --- a/java/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/AvroToArrowIteratorTest.java +++ b/java/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/AvroToArrowIteratorTest.java @@ -50,6 +50,7 @@ public class AvroToArrowIteratorTest extends AvroTestBase { @BeforeEach + @Override public void init() { final BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); this.config = new AvroToArrowConfigBuilder(allocator).setTargetBatchSize(3).build(); diff --git a/java/adapter/jdbc/pom.xml b/java/adapter/jdbc/pom.xml index 099798a95cd25..5ebb4089cf72f 100644 --- a/java/adapter/jdbc/pom.xml +++ b/java/adapter/jdbc/pom.xml @@ -116,6 +116,15 @@ under the License. --add-reads=org.apache.arrow.adapter.jdbc=com.fasterxml.jackson.dataformat.yaml --add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED -Duser.timezone=UTC + + org.apache.maven.plugins + maven-compiler-plugin + + + -Werror + + + diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowCharSetTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowCharSetTest.java index 726e1905c4242..39c0085603f17 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowCharSetTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowCharSetTest.java @@ -91,6 +91,7 @@ public static Stream getTestData() */ @ParameterizedTest @MethodSource("getTestData") + @Override public void testJdbcToArrowValues(Table table) throws SQLException, IOException, ClassNotFoundException { this.initializeDatabase(table); diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowDataTypesTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowDataTypesTest.java index c246bb2bec47e..2274f51745973 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowDataTypesTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowDataTypesTest.java @@ -145,6 +145,7 @@ public static Stream getTestData() /** Test Method to test JdbcToArrow Functionality for various H2 DB based datatypes. */ @ParameterizedTest @MethodSource("getTestData") + @Override public void testJdbcToArrowValues(Table table) throws SQLException, IOException, ClassNotFoundException { this.initializeDatabase(table); diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowMapDataTypeTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowMapDataTypeTest.java index 1daeda6772b26..456d338f6bd75 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowMapDataTypeTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowMapDataTypeTest.java @@ -45,6 +45,7 @@ public static Stream getTestData() throws IOException { /** Test Method to test JdbcToArrow Functionality for Map form Types.OTHER column. 
*/ @ParameterizedTest @MethodSource("getTestData") + @Override public void testJdbcToArrowValues(Table table) throws SQLException, IOException, ClassNotFoundException { this.initializeDatabase(table); diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowNullTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowNullTest.java index 205b7e16f2f09..2009268980afe 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowNullTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowNullTest.java @@ -113,6 +113,7 @@ public static Stream getTestData() */ @ParameterizedTest @MethodSource("getTestData") + @Override public void testJdbcToArrowValues(Table table) throws SQLException, IOException, ClassNotFoundException { this.initializeDatabase(table); diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowOptionalColumnsTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowOptionalColumnsTest.java index 382d20f45d4b1..2108afec4c945 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowOptionalColumnsTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowOptionalColumnsTest.java @@ -59,6 +59,7 @@ public static Stream getTestData() */ @ParameterizedTest @MethodSource("getTestData") + @Override public void testJdbcToArrowValues(Table table) throws SQLException, IOException, ClassNotFoundException { this.initializeDatabase(table); diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTest.java index 7966f62e175e3..bea7d4d37c50e 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTest.java @@ -87,6 +87,7 @@ public static Stream getTestData() */ @ParameterizedTest @MethodSource("getTestData") + @Override public void testJdbcToArrowValues(Table table) throws SQLException, IOException, ClassNotFoundException { this.initializeDatabase(table); diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTimeZoneTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTimeZoneTest.java index 0f60c89d1c03c..14396997d2863 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTimeZoneTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTimeZoneTest.java @@ -91,6 +91,7 @@ public static Stream getTestData() */ @ParameterizedTest @MethodSource("getTestData") + @Override public void testJdbcToArrowValues(Table table) throws SQLException, IOException, ClassNotFoundException { this.initializeDatabase(table); diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowVectorIteratorTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowVectorIteratorTest.java index 40fd39ac0c555..de9eff327ef6f 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowVectorIteratorTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowVectorIteratorTest.java @@ -76,6 +76,7 @@ public class JdbcToArrowVectorIteratorTest extends JdbcToArrowTest { @ParameterizedTest 
@MethodSource("getTestData") + @Override public void testJdbcToArrowValues(Table table) throws SQLException, IOException, ClassNotFoundException { this.initializeDatabase(table); diff --git a/java/adapter/orc/pom.xml b/java/adapter/orc/pom.xml index d9cd2bb21a526..cf35397c9917b 100644 --- a/java/adapter/orc/pom.xml +++ b/java/adapter/orc/pom.xml @@ -160,6 +160,15 @@ under the License. + + org.apache.maven.plugins + maven-compiler-plugin + + + -Werror + + + From c53f430c76d6ea49e11d3281be147bdd20d9f9f8 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Wed, 11 Sep 2024 11:02:26 +0900 Subject: [PATCH 113/130] GH-44006: [GLib][Parquet] Add `gparquet_arrow_file_writer_new_row_group()` (#44039) ### Rationale for this change This is a low-level API to control how to write data. This is for advanced users. ### What changes are included in this PR? `gparquet_arrow_file_writer_write_chunked_array()` is also added to write a test for `gparquet_arrow_file_writer_new_row_group()`. ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. * GitHub Issue: #44006 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- c_glib/parquet-glib/arrow-file-writer.cpp | 50 +++++++++++++++++-- c_glib/parquet-glib/arrow-file-writer.h | 14 +++++- c_glib/test/parquet/test-arrow-file-writer.rb | 30 +++++++++++ 3 files changed, 90 insertions(+), 4 deletions(-) diff --git a/c_glib/parquet-glib/arrow-file-writer.cpp b/c_glib/parquet-glib/arrow-file-writer.cpp index 0d0e87e7e3ede..7a672f1f21dcc 100644 --- a/c_glib/parquet-glib/arrow-file-writer.cpp +++ b/c_glib/parquet-glib/arrow-file-writer.cpp @@ -548,13 +548,57 @@ gparquet_arrow_file_writer_write_record_batch(GParquetArrowFileWriter *writer, gboolean gparquet_arrow_file_writer_write_table(GParquetArrowFileWriter *writer, GArrowTable *table, - guint64 chunk_size, + gsize chunk_size, GError **error) { auto parquet_arrow_file_writer = gparquet_arrow_file_writer_get_raw(writer); auto arrow_table = garrow_table_get_raw(table).get(); - auto status = parquet_arrow_file_writer->WriteTable(*arrow_table, chunk_size); - return garrow_error_check(error, status, "[parquet][arrow][file-writer][write-table]"); + return garrow::check(error, + parquet_arrow_file_writer->WriteTable(*arrow_table, chunk_size), + "[parquet][arrow][file-writer][write-table]"); +} + +/** + * gparquet_arrow_file_writer_new_row_group: + * @writer: A #GParquetArrowFileWriter. + * @chunk_size: The max number of rows in a row group. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 18.0.0 + */ +gboolean +gparquet_arrow_file_writer_new_row_group(GParquetArrowFileWriter *writer, + gsize chunk_size, + GError **error) +{ + auto parquet_arrow_file_writer = gparquet_arrow_file_writer_get_raw(writer); + return garrow::check(error, + parquet_arrow_file_writer->NewRowGroup(chunk_size), + "[parquet][arrow][file-writer][new-row-group]"); +} + +/** + * gparquet_arrow_file_writer_write_chunked_array: + * @writer: A #GParquetArrowFileWriter. + * @chunked_array: A #GArrowChunkedArray to be written. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. 
+ * + * Since: 18.0.0 + */ +gboolean +gparquet_arrow_file_writer_write_chunked_array(GParquetArrowFileWriter *writer, + GArrowChunkedArray *chunked_array, + GError **error) +{ + auto parquet_arrow_file_writer = gparquet_arrow_file_writer_get_raw(writer); + auto arrow_chunked_array = garrow_chunked_array_get_raw(chunked_array); + return garrow::check(error, + parquet_arrow_file_writer->WriteColumnChunk(arrow_chunked_array), + "[parquet][arrow][file-writer][write-chunked-array]"); } /** diff --git a/c_glib/parquet-glib/arrow-file-writer.h b/c_glib/parquet-glib/arrow-file-writer.h index 7eb14fe27a8bf..40595bdfef4b9 100644 --- a/c_glib/parquet-glib/arrow-file-writer.h +++ b/c_glib/parquet-glib/arrow-file-writer.h @@ -130,9 +130,21 @@ GPARQUET_AVAILABLE_IN_0_11 gboolean gparquet_arrow_file_writer_write_table(GParquetArrowFileWriter *writer, GArrowTable *table, - guint64 chunk_size, + gsize chunk_size, GError **error); +GPARQUET_AVAILABLE_IN_18_0 +gboolean +gparquet_arrow_file_writer_new_row_group(GParquetArrowFileWriter *writer, + gsize chunk_size, + GError **error); + +GPARQUET_AVAILABLE_IN_18_0 +gboolean +gparquet_arrow_file_writer_write_chunked_array(GParquetArrowFileWriter *writer, + GArrowChunkedArray *chunked_array, + GError **error); + GPARQUET_AVAILABLE_IN_0_11 gboolean gparquet_arrow_file_writer_close(GParquetArrowFileWriter *writer, GError **error); diff --git a/c_glib/test/parquet/test-arrow-file-writer.rb b/c_glib/test/parquet/test-arrow-file-writer.rb index e348c9b679524..89db16c6fb90b 100644 --- a/c_glib/test/parquet/test-arrow-file-writer.rb +++ b/c_glib/test/parquet/test-arrow-file-writer.rb @@ -82,4 +82,34 @@ def test_write_table reader.unref end end + + def test_write_chunked_array + schema = build_schema("enabled" => :boolean) + writer = Parquet::ArrowFileWriter.new(schema, @file.path) + writer.new_row_group(2) + chunked_array = Arrow::ChunkedArray.new([build_boolean_array([true, nil])]) + writer.write_chunked_array(chunked_array) + writer.new_row_group(1) + chunked_array = Arrow::ChunkedArray.new([build_boolean_array([false])]) + writer.write_chunked_array(chunked_array) + writer.close + + reader = Parquet::ArrowFileReader.new(@file.path) + begin + reader.use_threads = true + assert_equal([ + 2, + build_table("enabled" => [ + build_boolean_array([true, nil]), + build_boolean_array([false]), + ]), + ], + [ + reader.n_row_groups, + reader.read_table, + ]) + ensure + reader.unref + end + end end From e4a6f1e38df860c7887e9ad823ea88320b70c90b Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Wed, 11 Sep 2024 15:21:24 +0900 Subject: [PATCH 114/130] GH-44050: [CI][Integration] Execute integration test again (#44051) ### Rationale for this change `>` in YAML removes newlines. ### What changes are included in this PR? Use `|` instead of `>` to keep newlines. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. 
* GitHub Issue: #44050 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- .github/workflows/integration.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 2d19b1e59b27a..b73f900e616f5 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -100,7 +100,7 @@ jobs: env: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} - run: > + run: | source ci/scripts/util_enable_core_dumps.sh archery docker run \ -e ARCHERY_DEFAULT_BRANCH=${{ github.event.repository.default_branch }} \ From 8d5a7751c089c413e6e2421dd905158b4990320b Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Wed, 11 Sep 2024 04:26:48 -0400 Subject: [PATCH 115/130] GH-43973: [Python] Table fails gracefully on non-cpu devices (#43974) ## Rationale for this change Table APIs should throw python exception instead of segfault if they don't support operating on non-cpu memory. ### What changes are included in this PR? * Add is_cpu() property to Table * Add _assert_cpu() checks to Table APIs that only support operating on cpu memory ### Are these changes tested? * Unit tests ### Are there any user-facing changes? No, besides receiving a friendlier error in certain scenarios. * GitHub Issue: #43973 Lead-authored-by: Dane Pitkin Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/lib.pxd | 2 + python/pyarrow/table.pxi | 30 ++++ python/pyarrow/tests/test_table.py | 244 +++++++++++++++++++++++++++-- 3 files changed, 265 insertions(+), 11 deletions(-) diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 1caf58e20e653..25a7945dc3ddc 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -525,6 +525,8 @@ cdef class Table(_Tabular): cdef: shared_ptr[CTable] sp_table CTable* table + c_bool _is_cpu + c_bool _init_is_cpu cdef void init(self, const shared_ptr[CTable]& table) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 3b0df981e017c..819bbc34c66b9 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -4180,6 +4180,7 @@ cdef class Table(_Tabular): def __cinit__(self): self.table = NULL + self._init_is_cpu = False cdef void init(self, const shared_ptr[CTable]& table): self.sp_table = table @@ -4205,6 +4206,7 @@ cdef class Table(_Tabular): ArrowInvalid """ if full: + self._assert_cpu() with nogil: check_status(self.table.ValidateFull()) else: @@ -4214,6 +4216,7 @@ cdef class Table(_Tabular): def __reduce__(self): # Reduce the columns as ChunkedArrays to avoid serializing schema # data twice + self._assert_cpu() columns = [col for col in self.columns] return _reconstruct_table, (columns, self.schema) @@ -4452,6 +4455,7 @@ cdef class Table(_Tabular): a.year: [[null,2022]] month: [[4,6]] """ + self._assert_cpu() cdef: shared_ptr[CTable] flattened CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) @@ -4499,6 +4503,7 @@ cdef class Table(_Tabular): n_legs: [[2,2,4,4,5,100]] animals: [["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"]] """ + self._assert_cpu() cdef: shared_ptr[CTable] combined CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) @@ -4556,6 +4561,7 @@ cdef class Table(_Tabular): ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] -- indices: [3,4,5]] """ + self._assert_cpu() cdef: CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) shared_ptr[CTable] c_result @@ -4601,6 +4607,7 @@ cdef class Table(_Tabular): 
>>> table.equals(table_1, check_metadata=True) False """ + self._assert_cpu() if other is None: return False @@ -4658,6 +4665,7 @@ cdef class Table(_Tabular): n_legs: [[2,4,5,100]] animals: [["Flamingo","Horse","Brittle stars","Centipede"]] """ + self._assert_cpu() cdef: ChunkedArray column, casted Field field @@ -4909,6 +4917,7 @@ cdef class Table(_Tabular): ------- ChunkedArray """ + self._assert_cpu() return chunked_array([ batch.to_struct_array() for batch in self.to_batches(max_chunksize=max_chunksize) @@ -5118,6 +5127,7 @@ cdef class Table(_Tabular): def _to_pandas(self, options, categories=None, ignore_metadata=False, types_mapper=None): + self._assert_cpu() from pyarrow.pandas_compat import table_to_dataframe df = table_to_dataframe( options, self, categories, @@ -5239,6 +5249,7 @@ cdef class Table(_Tabular): >>> table.nbytes 72 """ + self._assert_cpu() cdef: CResult[int64_t] c_res_buffer @@ -5268,6 +5279,7 @@ cdef class Table(_Tabular): >>> table.get_total_buffer_size() 76 """ + self._assert_cpu() cdef: int64_t total_buffer_size @@ -5576,6 +5588,7 @@ cdef class Table(_Tabular): year: [[2020,2022,2021,2019]] n_legs_sum: [[2,6,104,5]] """ + self._assert_cpu() return TableGroupBy(self, keys, use_threads=use_threads) def join(self, right_table, keys, right_keys=None, join_type="left outer", @@ -5685,6 +5698,7 @@ cdef class Table(_Tabular): n_legs: [[100]] animal: [["Centipede"]] """ + self._assert_cpu() if right_keys is None: right_keys = keys return _pac()._perform_join( @@ -5772,6 +5786,7 @@ cdef class Table(_Tabular): n_legs: [[null,5,null,5,null]] animal: [[null,"Brittle stars",null,"Brittle stars",null]] """ + self._assert_cpu() if right_on is None: right_on = on if right_by is None: @@ -5797,8 +5812,23 @@ cdef class Table(_Tabular): ------- PyCapsule """ + self._assert_cpu() return self.to_reader().__arrow_c_stream__(requested_schema) + @property + def is_cpu(self): + """ + Whether all ChunkedArrays are CPU-accessible. 
+ """ + if not self._init_is_cpu: + self._is_cpu = all(c.is_cpu for c in self.itercolumns()) + self._init_is_cpu = True + return self._is_cpu + + cdef void _assert_cpu(self) except *: + if not self.is_cpu: + raise NotImplementedError("Implemented only for data on CPU device") + def _reconstruct_table(arrays, schema): """ diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index c3f805b4b32d6..b66a5eb083cc5 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -3430,6 +3430,21 @@ def cuda_recordbatch(cuda_context, cpu_recordbatch): return cpu_recordbatch.copy_to(cuda_context.memory_manager) +@pytest.fixture +def cpu_table(schema, cpu_chunked_array): + return pa.table([cpu_chunked_array, cpu_chunked_array], schema=schema) + + +@pytest.fixture +def cuda_table(schema, cuda_chunked_array): + return pa.table([cuda_chunked_array, cuda_chunked_array], schema=schema) + + +@pytest.fixture +def cpu_and_cuda_table(schema, cpu_chunked_array, cuda_chunked_array): + return pa.table([cpu_chunked_array, cuda_chunked_array], schema=schema) + + def test_chunked_array_non_cpu(cuda_context, cpu_chunked_array, cuda_chunked_array, cpu_and_cuda_chunked_array): # type test @@ -3586,6 +3601,9 @@ def verify_cuda_recordbatch(batch, expected_schema): def test_recordbatch_non_cpu(cuda_context, cpu_recordbatch, cuda_recordbatch, cuda_arrays, schema): verify_cuda_recordbatch(cuda_recordbatch, expected_schema=schema) + N = cuda_recordbatch.num_rows + + # shape test assert cuda_recordbatch.shape == (5, 2) # columns() test @@ -3593,24 +3611,26 @@ def test_recordbatch_non_cpu(cuda_context, cpu_recordbatch, cuda_recordbatch, # add_column(), set_column() test for fn in [cuda_recordbatch.add_column, cuda_recordbatch.set_column]: - col = pa.array([6, 7, 8, 9, 10], pa.int8()).copy_to(cuda_context.memory_manager) + col = pa.array([-2, -1, 0, 1, 2], pa.int8() + ).copy_to(cuda_context.memory_manager) new_batch = fn(2, 'c2', col) - assert len(new_batch.columns) == 3 - for c in new_batch.columns: - assert c.device_type == pa.DeviceAllocationType.CUDA + verify_cuda_recordbatch( + new_batch, expected_schema=schema.append(pa.field('c2', pa.int8()))) err_msg = ("Got column on device , " "but expected .") with pytest.raises(TypeError, match=err_msg): - fn(2, 'c2', [1, 1, 1, 1, 1]) + fn(2, 'c2', [1] * N) # remove_column() test new_batch = cuda_recordbatch.remove_column(1) verify_cuda_recordbatch(new_batch, expected_schema=schema.remove(1)) # drop_columns() test - new_batch = cuda_recordbatch.drop_columns(['c0', 'c1']) - assert len(new_batch.columns) == 0 - assert new_batch.device_type == pa.DeviceAllocationType.CUDA + new_batch = cuda_recordbatch.drop_columns(['c1']) + verify_cuda_recordbatch(new_batch, expected_schema=schema.remove(1)) + empty_batch = cuda_recordbatch.drop_columns(['c0', 'c1']) + assert len(empty_batch.columns) == 0 + assert empty_batch.device_type == pa.DeviceAllocationType.CUDA # select() test new_batch = cuda_recordbatch.select(['c0']) @@ -3622,8 +3642,7 @@ def test_recordbatch_non_cpu(cuda_context, cpu_recordbatch, cuda_recordbatch, cuda_recordbatch.cast(new_schema) # drop_null() test - null_col = pa.array([-2, -1, 0, 1, 2], - mask=[True, False, True, False, True]).copy_to( + null_col = pa.array([1] * N, mask=[True, False, True, False, True]).copy_to( cuda_context.memory_manager) cuda_recordbatch_with_nulls = cuda_recordbatch.add_column(2, 'c2', null_col) with pytest.raises(NotImplementedError): @@ -3631,7 +3650,7 @@ def 
test_recordbatch_non_cpu(cuda_context, cpu_recordbatch, cuda_recordbatch, # filter() test with pytest.raises(NotImplementedError): - cuda_recordbatch.filter([True] * 5) + cuda_recordbatch.filter([True] * N) # take() test with pytest.raises(NotImplementedError): @@ -3737,3 +3756,206 @@ def test_recordbatch_non_cpu(cuda_context, cpu_recordbatch, cuda_recordbatch, # __dataframe__() test with pytest.raises(NotImplementedError): from_dataframe(cuda_recordbatch.__dataframe__()) + + +def verify_cuda_table(table, expected_schema): + table.validate() + assert table.is_cpu is False + assert table.num_columns == len(expected_schema.names) + assert table.column_names == expected_schema.names + assert str(table) in repr(table) + for c in table.columns: + assert c.is_cpu is False + for chunk in c.iterchunks(): + assert chunk.is_cpu is False + assert chunk.device_type == pa.DeviceAllocationType.CUDA + assert table.schema == expected_schema + + +def test_table_non_cpu(cuda_context, cpu_table, cuda_table, + cuda_arrays, cuda_recordbatch, schema): + verify_cuda_table(cuda_table, expected_schema=schema) + N = cuda_table.num_rows + + # shape test + assert cuda_table.shape == (10, 2) + + # columns() test + assert len(cuda_table.columns) == 2 + + # add_column(), set_column() test + for fn in [cuda_table.add_column, cuda_table.set_column]: + cpu_col = pa.array([1] * N, pa.int8()) + cuda_col = cpu_col.copy_to(cuda_context.memory_manager) + new_table = fn(2, 'c2', cuda_col) + verify_cuda_table(new_table, expected_schema=schema.append( + pa.field('c2', pa.int8()))) + new_table = fn(2, 'c2', cpu_col) + assert new_table.is_cpu is False + assert new_table.column(0).is_cpu is False + assert new_table.column(1).is_cpu is False + assert new_table.column(2).is_cpu is True + + # remove_column() test + new_table = cuda_table.remove_column(1) + verify_cuda_table(new_table, expected_schema=schema.remove(1)) + + # drop_columns() test + new_table = cuda_table.drop_columns(['c1']) + verify_cuda_table(new_table, expected_schema=schema.remove(1)) + new_table = cuda_table.drop_columns(['c0', 'c1']) + assert len(new_table.columns) == 0 + assert new_table.is_cpu + + # select() test + new_table = cuda_table.select(['c0']) + verify_cuda_table(new_table, expected_schema=schema.remove(1)) + + # cast() test + new_schema = pa.schema([pa.field('c0', pa.int64()), pa.field('c1', pa.int64())]) + with pytest.raises(NotImplementedError): + cuda_table.cast(new_schema) + + # drop_null() test + null_col = pa.array([1] * N, mask=[True] * N).copy_to(cuda_context.memory_manager) + cuda_table_with_nulls = cuda_table.add_column(2, 'c2', null_col) + with pytest.raises(NotImplementedError): + cuda_table_with_nulls.drop_null() + + # filter() test + with pytest.raises(NotImplementedError): + cuda_table.filter([True] * N) + + # take() test + with pytest.raises(NotImplementedError): + cuda_table.take([0]) + + # sort_by() test + with pytest.raises(NotImplementedError): + cuda_table.sort_by('c0') + + # field() test + assert cuda_table.field(0) == schema.field(0) + assert cuda_table.field(1) == schema.field(1) + + # equals() test + with pytest.raises(NotImplementedError): + assert cuda_table.equals(cpu_table) + + # from_arrays() test + new_table = pa.Table.from_arrays(cuda_arrays, ['c0', 'c1']) + verify_cuda_table(new_table, expected_schema=schema) + + # from_pydict() test + new_table = pa.Table.from_pydict({'c0': cuda_arrays[0], 'c1': cuda_arrays[1]}) + verify_cuda_table(new_table, expected_schema=schema) + + # from_struct_array() test + fields = 
[schema.field(i) for i in range(len(schema.names))] + struct_array = pa.StructArray.from_arrays(cuda_arrays, fields=fields) + with pytest.raises(NotImplementedError): + pa.Table.from_struct_array(struct_array) + + # from_batches() test + new_table = pa.Table.from_batches([cuda_recordbatch, cuda_recordbatch], schema) + verify_cuda_table(new_table, expected_schema=schema) + + # nbytes test + with pytest.raises(NotImplementedError): + assert cuda_table.nbytes + + # get_total_buffer_size() test + with pytest.raises(NotImplementedError): + assert cuda_table.get_total_buffer_size() + + # to_pydict() test + with pytest.raises(NotImplementedError): + cuda_table.to_pydict() + + # to_pylist() test + with pytest.raises(NotImplementedError): + cuda_table.to_pylist() + + # to_pandas() test + with pytest.raises(NotImplementedError): + cuda_table.to_pandas() + + # to_struct_array() test + with pytest.raises(NotImplementedError): + cuda_table.to_struct_array() + + # to_batches() test + batches = cuda_table.to_batches(max_chunksize=5) + for batch in batches: + # GH-44049 + with pytest.raises(AssertionError): + verify_cuda_recordbatch(batch, expected_schema=schema) + + # to_reader() test + reader = cuda_table.to_reader(max_chunksize=5) + for batch in reader: + # GH-44049 + with pytest.raises(AssertionError): + verify_cuda_recordbatch(batch, expected_schema=schema) + + # slice() test + new_table = cuda_table.slice(1, 3) + verify_cuda_table(new_table, expected_schema=schema) + assert new_table.num_rows == 3 + + # replace_schema_metadata() test + new_table = cuda_table.replace_schema_metadata({b'key': b'value'}) + verify_cuda_table(new_table, expected_schema=schema) + assert new_table.schema.metadata == {b'key': b'value'} + + # rename_columns() test + new_table = cuda_table.rename_columns(['col0', 'col1']) + expected_schema = pa.schema( + [pa.field('col0', schema.field(0).type), + pa.field('col1', schema.field(1).type)]) + verify_cuda_table(new_table, expected_schema=expected_schema) + + # validate() test + cuda_table.validate() + with pytest.raises(NotImplementedError): + cuda_table.validate(full=True) + + # flatten() test + with pytest.raises(NotImplementedError): + cuda_table.flatten() + + # combine_chunks() test + with pytest.raises(NotImplementedError): + cuda_table.flatten() + + # unify_dictionaries() test + with pytest.raises(NotImplementedError): + cuda_table.unify_dictionaries() + + # group_by() test + with pytest.raises(NotImplementedError): + cuda_table.group_by('c0') + + # join() test + with pytest.raises(NotImplementedError): + cuda_table.join(cuda_table, 'c0') + + # join_asof() test + with pytest.raises(NotImplementedError): + cuda_table.join_asof(cuda_table, 'c0', 'c0', 0) + + # __array__() test + with pytest.raises(NotImplementedError): + cuda_table.__array__() + + # __arrow_c_stream__() test + with pytest.raises(NotImplementedError): + cuda_table.__arrow_c_stream__() + + # __dataframe__() test + with pytest.raises(NotImplementedError): + from_dataframe(cuda_table.__dataframe__()) + + # __reduce__() test + with pytest.raises(NotImplementedError): + cuda_table.__reduce__() From d4b38fd94cbe77e618e9d23cd73aff37161a04f7 Mon Sep 17 00:00:00 2001 From: Pradeep Gollakota Date: Wed, 11 Sep 2024 03:45:08 -0700 Subject: [PATCH 116/130] GH-32538: [C++][Parquet] Add JSON canonical extension type (#13901) Arrow now provides a canonical extension type for JSON data. This extension is backed by utf8(). Parquet will recognize this extension and appropriately propagate the LogicalType to the storage format. 
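As a reference for reviewers, a minimal sketch of how a reader can opt in to this mapping via the new `ArrowReaderProperties::set_arrow_extensions_enabled()` flag added below (the file path, function wrapper and error handling are illustrative assumptions, not code from this change):

```cpp
#include <memory>
#include <string>

#include "arrow/api.h"
#include "arrow/io/api.h"
#include "parquet/arrow/reader.h"
#include "parquet/properties.h"

// Minimal sketch: read a Parquet file whose column was written with
// LogicalType::JSON. With the new reader property enabled, the column is
// reconstructed as arrow::extension::json() instead of plain utf8().
arrow::Status ReadJsonColumn(const std::string& path) {
  ARROW_ASSIGN_OR_RAISE(auto input, arrow::io::ReadableFile::Open(path));

  parquet::ArrowReaderProperties reader_props;
  reader_props.set_arrow_extensions_enabled(true);

  parquet::arrow::FileReaderBuilder builder;
  ARROW_RETURN_NOT_OK(builder.Open(input));
  std::unique_ptr<parquet::arrow::FileReader> reader;
  ARROW_RETURN_NOT_OK(builder.properties(reader_props)->Build(&reader));

  std::shared_ptr<arrow::Table> table;
  ARROW_RETURN_NOT_OK(reader->ReadTable(&table));
  // table->schema() now reports the JSON extension type for that column.
  return arrow::Status::OK();
}
```

With the flag left at its default of `false` and no stored Arrow schema, the same column is read back as plain `utf8()`.
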
* GitHub Issue: #32538 Lead-authored-by: Rok Mihevc Co-authored-by: Pradeep Gollakota Co-authored-by: Antoine Pitrou Co-authored-by: mwish Co-authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/array/validate.cc | 20 +++- cpp/src/arrow/extension/CMakeLists.txt | 2 +- .../extension/fixed_shape_tensor_test.cc | 6 +- cpp/src/arrow/extension/json.cc | 61 ++++++++++++ cpp/src/arrow/extension/json.h | 56 +++++++++++ cpp/src/arrow/extension/json_test.cc | 83 ++++++++++++++++ cpp/src/arrow/extension/uuid_test.cc | 4 +- cpp/src/arrow/extension_type.cc | 4 +- cpp/src/arrow/extension_type_test.cc | 6 +- cpp/src/arrow/ipc/test_common.cc | 17 ++-- cpp/src/arrow/ipc/test_common.h | 4 +- cpp/src/arrow/testing/gtest_util.cc | 1 + .../parquet/arrow/arrow_reader_writer_test.cc | 61 +++++++++++- cpp/src/parquet/arrow/arrow_schema_test.cc | 94 ++++++++++++++++++- cpp/src/parquet/arrow/schema.cc | 46 ++++++--- cpp/src/parquet/arrow/schema_internal.cc | 24 +++-- cpp/src/parquet/arrow/schema_internal.h | 8 +- cpp/src/parquet/properties.h | 16 +++- docs/source/status.rst | 2 +- 20 files changed, 460 insertions(+), 56 deletions(-) create mode 100644 cpp/src/arrow/extension/json.cc create mode 100644 cpp/src/arrow/extension/json.h create mode 100644 cpp/src/arrow/extension/json_test.cc diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 01ac813f4713b..e77a02d0c0800 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -376,6 +376,7 @@ set(ARROW_SRCS device_allocation_type_set.cc extension_type.cc extension/bool8.cc + extension/json.cc extension/uuid.cc pretty_print.cc record_batch.cc diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 0d940d3bc869e..69f1646054f4c 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -985,10 +985,22 @@ Status ValidateArrayFull(const Array& array) { return ValidateArrayFull(*array.d ARROW_EXPORT Status ValidateUTF8(const ArrayData& data) { - DCHECK(data.type->id() == Type::STRING || data.type->id() == Type::STRING_VIEW || - data.type->id() == Type::LARGE_STRING); - UTF8DataValidator validator{data}; - return VisitTypeInline(*data.type, &validator); + const auto& storage_type = + (data.type->id() == Type::EXTENSION) + ? checked_cast(*data.type).storage_type() + : data.type; + DCHECK(storage_type->id() == Type::STRING || storage_type->id() == Type::STRING_VIEW || + storage_type->id() == Type::LARGE_STRING); + + if (data.type->id() == Type::EXTENSION) { + ArrayData ext_data(data); + ext_data.type = storage_type; + UTF8DataValidator validator{ext_data}; + return VisitTypeInline(*storage_type, &validator); + } else { + UTF8DataValidator validator{data}; + return VisitTypeInline(*storage_type, &validator); + } } ARROW_EXPORT diff --git a/cpp/src/arrow/extension/CMakeLists.txt b/cpp/src/arrow/extension/CMakeLists.txt index 065ea3f1ddb16..4ab6a35b52e4f 100644 --- a/cpp/src/arrow/extension/CMakeLists.txt +++ b/cpp/src/arrow/extension/CMakeLists.txt @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
-set(CANONICAL_EXTENSION_TESTS bool8_test.cc uuid_test.cc) +set(CANONICAL_EXTENSION_TESTS bool8_test.cc json_test.cc uuid_test.cc) if(ARROW_JSON) list(APPEND CANONICAL_EXTENSION_TESTS fixed_shape_tensor_test.cc opaque_test.cc) diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc index 842a78e1a4f7a..51aea4b25fdda 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc @@ -205,7 +205,7 @@ TEST_F(TestExtensionType, RoundtripBatch) { std::shared_ptr read_batch; auto ext_field = field(/*name=*/"f0", /*type=*/ext_type_); auto batch = RecordBatch::Make(schema({ext_field}), ext_arr->length(), {ext_arr}); - RoundtripBatch(batch, &read_batch); + ASSERT_OK(RoundtripBatch(batch, &read_batch)); CompareBatch(*batch, *read_batch, /*compare_metadata=*/true); // Pass extension metadata and storage array, expect getting back extension array @@ -216,7 +216,7 @@ TEST_F(TestExtensionType, RoundtripBatch) { ext_field = field(/*name=*/"f0", /*type=*/element_type_, /*nullable=*/true, /*metadata=*/ext_metadata); auto batch2 = RecordBatch::Make(schema({ext_field}), fsla_arr->length(), {fsla_arr}); - RoundtripBatch(batch2, &read_batch2); + ASSERT_OK(RoundtripBatch(batch2, &read_batch2)); CompareBatch(*batch, *read_batch2, /*compare_metadata=*/true); } @@ -469,7 +469,7 @@ TEST_F(TestExtensionType, RoundtripBatchFromTensor) { auto ext_field = field("f0", ext_type_, true, ext_metadata); auto batch = RecordBatch::Make(schema({ext_field}), ext_arr->length(), {ext_arr}); std::shared_ptr read_batch; - RoundtripBatch(batch, &read_batch); + ASSERT_OK(RoundtripBatch(batch, &read_batch)); CompareBatch(*batch, *read_batch, /*compare_metadata=*/true); } diff --git a/cpp/src/arrow/extension/json.cc b/cpp/src/arrow/extension/json.cc new file mode 100644 index 0000000000000..d793233c2b573 --- /dev/null +++ b/cpp/src/arrow/extension/json.cc @@ -0,0 +1,61 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/extension/json.h" + +#include + +#include "arrow/extension_type.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/type_fwd.h" +#include "arrow/util/logging.h" + +namespace arrow::extension { + +bool JsonExtensionType::ExtensionEquals(const ExtensionType& other) const { + return other.extension_name() == this->extension_name(); +} + +Result> JsonExtensionType::Deserialize( + std::shared_ptr storage_type, const std::string& serialized) const { + if (storage_type->id() != Type::STRING && storage_type->id() != Type::STRING_VIEW && + storage_type->id() != Type::LARGE_STRING) { + return Status::Invalid("Invalid storage type for JsonExtensionType: ", + storage_type->ToString()); + } + return std::make_shared(storage_type); +} + +std::string JsonExtensionType::Serialize() const { return ""; } + +std::shared_ptr JsonExtensionType::MakeArray( + std::shared_ptr data) const { + DCHECK_EQ(data->type->id(), Type::EXTENSION); + DCHECK_EQ("arrow.json", + internal::checked_cast(*data->type).extension_name()); + return std::make_shared(data); +} + +std::shared_ptr json(const std::shared_ptr storage_type) { + ARROW_CHECK(storage_type->id() != Type::STRING || + storage_type->id() != Type::STRING_VIEW || + storage_type->id() != Type::LARGE_STRING); + return std::make_shared(storage_type); +} + +} // namespace arrow::extension diff --git a/cpp/src/arrow/extension/json.h b/cpp/src/arrow/extension/json.h new file mode 100644 index 0000000000000..4793ab2bc9b36 --- /dev/null +++ b/cpp/src/arrow/extension/json.h @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/extension_type.h" +#include "arrow/result.h" +#include "arrow/type_fwd.h" +#include "arrow/util/visibility.h" + +namespace arrow::extension { + +/// \brief Concrete type class for variable-size JSON data, utf8-encoded. +class ARROW_EXPORT JsonExtensionType : public ExtensionType { + public: + explicit JsonExtensionType(const std::shared_ptr& storage_type) + : ExtensionType(storage_type), storage_type_(storage_type) {} + + std::string extension_name() const override { return "arrow.json"; } + + bool ExtensionEquals(const ExtensionType& other) const override; + + Result> Deserialize( + std::shared_ptr storage_type, + const std::string& serialized_data) const override; + + std::string Serialize() const override; + + std::shared_ptr MakeArray(std::shared_ptr data) const override; + + private: + std::shared_ptr storage_type_; +}; + +/// \brief Return a JsonExtensionType instance. 
+ARROW_EXPORT std::shared_ptr json( + std::shared_ptr storage_type = utf8()); + +} // namespace arrow::extension diff --git a/cpp/src/arrow/extension/json_test.cc b/cpp/src/arrow/extension/json_test.cc new file mode 100644 index 0000000000000..143e4f9ceeac7 --- /dev/null +++ b/cpp/src/arrow/extension/json_test.cc @@ -0,0 +1,83 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/extension/json.h" + +#include "arrow/array/validate.h" +#include "arrow/ipc/test_common.h" +#include "arrow/record_batch.h" +#include "arrow/testing/gtest_util.h" +#include "parquet/exception.h" + +namespace arrow { + +using arrow::ipc::test::RoundtripBatch; +using extension::json; + +class TestJsonExtensionType : public ::testing::Test {}; + +std::shared_ptr ExampleJson(const std::shared_ptr& storage_type) { + std::shared_ptr arr = ArrayFromJSON(storage_type, R"([ + "null", + "1234", + "3.14159", + "true", + "false", + "\"a json string\"", + "[\"a\", \"json\", \"array\"]", + "{\"obj\": \"a simple json object\"}" + ])"); + return ExtensionType::WrapArray(arrow::extension::json(storage_type), arr); +} + +TEST_F(TestJsonExtensionType, JsonRoundtrip) { + for (const auto& storage_type : {utf8(), large_utf8(), utf8_view()}) { + std::shared_ptr ext_arr = ExampleJson(storage_type); + auto batch = + RecordBatch::Make(schema({field("f0", json(storage_type))}), 8, {ext_arr}); + + std::shared_ptr read_batch; + ASSERT_OK(RoundtripBatch(batch, &read_batch)); + ASSERT_OK(read_batch->ValidateFull()); + CompareBatch(*batch, *read_batch, /*compare_metadata*/ true); + + auto read_ext_arr = read_batch->column(0); + ASSERT_OK(internal::ValidateUTF8(*read_ext_arr)); + ASSERT_OK(read_ext_arr->ValidateFull()); + } +} + +TEST_F(TestJsonExtensionType, InvalidUTF8) { + for (const auto& storage_type : {utf8(), large_utf8(), utf8_view()}) { + auto json_type = json(storage_type); + auto invalid_input = ArrayFromJSON(storage_type, "[\"Ⱥa\xFFⱭ\", \"Ɽ\xe1\xbdⱤaA\"]"); + auto ext_arr = ExtensionType::WrapArray(json_type, invalid_input); + + ASSERT_RAISES_WITH_MESSAGE(Invalid, + "Invalid: Invalid UTF8 sequence at string index 0", + ext_arr->ValidateFull()); + ASSERT_RAISES_WITH_MESSAGE(Invalid, + "Invalid: Invalid UTF8 sequence at string index 0", + arrow::internal::ValidateUTF8(*ext_arr)); + + auto batch = RecordBatch::Make(schema({field("f0", json_type)}), 2, {ext_arr}); + std::shared_ptr read_batch; + ASSERT_OK(RoundtripBatch(batch, &read_batch)); + } +} + +} // namespace arrow diff --git a/cpp/src/arrow/extension/uuid_test.cc b/cpp/src/arrow/extension/uuid_test.cc index 3bbb6eeb4aef1..1c1ffb6eb8e15 100644 --- a/cpp/src/arrow/extension/uuid_test.cc +++ b/cpp/src/arrow/extension/uuid_test.cc @@ -54,7 +54,7 @@ TEST(TestUuuidExtensionType, RoundtripBatch) { std::shared_ptr 
read_batch; auto ext_field = field(/*name=*/"f0", /*type=*/ext_type); auto batch = RecordBatch::Make(schema({ext_field}), ext_arr->length(), {ext_arr}); - RoundtripBatch(batch, &read_batch); + ASSERT_OK(RoundtripBatch(batch, &read_batch)); CompareBatch(*batch, *read_batch, /*compare_metadata=*/true); // Pass extension metadata and storage array, expect getting back extension array @@ -65,7 +65,7 @@ TEST(TestUuuidExtensionType, RoundtripBatch) { ext_field = field(/*name=*/"f0", /*type=*/exact_ext_type->storage_type(), /*nullable=*/true, /*metadata=*/ext_metadata); auto batch2 = RecordBatch::Make(schema({ext_field}), arr->length(), {arr}); - RoundtripBatch(batch2, &read_batch2); + ASSERT_OK(RoundtripBatch(batch2, &read_batch2)); CompareBatch(*batch, *read_batch2, /*compare_metadata=*/true); } diff --git a/cpp/src/arrow/extension_type.cc b/cpp/src/arrow/extension_type.cc index d0135e905a0c3..7ad39eab23f8d 100644 --- a/cpp/src/arrow/extension_type.cc +++ b/cpp/src/arrow/extension_type.cc @@ -32,6 +32,7 @@ # include "arrow/extension/fixed_shape_tensor.h" # include "arrow/extension/opaque.h" #endif +#include "arrow/extension/json.h" #include "arrow/extension/uuid.h" #include "arrow/status.h" #include "arrow/type.h" @@ -148,7 +149,8 @@ static void CreateGlobalRegistry() { // Register canonical extension types g_registry = std::make_shared(); - std::vector> ext_types{extension::bool8(), extension::uuid()}; + std::vector> ext_types{extension::bool8(), extension::json(), + extension::uuid()}; #ifdef ARROW_JSON ext_types.push_back(extension::fixed_shape_tensor(int64(), {})); diff --git a/cpp/src/arrow/extension_type_test.cc b/cpp/src/arrow/extension_type_test.cc index f49ffc5cba553..029d833b98cd8 100644 --- a/cpp/src/arrow/extension_type_test.cc +++ b/cpp/src/arrow/extension_type_test.cc @@ -219,14 +219,14 @@ TEST_F(TestExtensionType, IpcRoundtrip) { auto batch = RecordBatch::Make(schema({field("f0", uuid())}), 4, {ext_arr}); std::shared_ptr read_batch; - RoundtripBatch(batch, &read_batch); + ASSERT_OK(RoundtripBatch(batch, &read_batch)); CompareBatch(*batch, *read_batch, false /* compare_metadata */); // Wrap type in a ListArray and ensure it also makes it auto offsets_arr = ArrayFromJSON(int32(), "[0, 0, 2, 4]"); ASSERT_OK_AND_ASSIGN(auto list_arr, ListArray::FromArrays(*offsets_arr, *ext_arr)); batch = RecordBatch::Make(schema({field("f0", list(uuid()))}), 3, {list_arr}); - RoundtripBatch(batch, &read_batch); + ASSERT_OK(RoundtripBatch(batch, &read_batch)); CompareBatch(*batch, *read_batch, false /* compare_metadata */); } @@ -289,7 +289,7 @@ TEST_F(TestExtensionType, ParametricTypes) { 4, {p1, p2, p3, p4}); std::shared_ptr read_batch; - RoundtripBatch(batch, &read_batch); + ASSERT_OK(RoundtripBatch(batch, &read_batch)); CompareBatch(*batch, *read_batch, false /* compare_metadata */); } diff --git a/cpp/src/arrow/ipc/test_common.cc b/cpp/src/arrow/ipc/test_common.cc index fb4f6bd8eadcf..e354e2f89b3b3 100644 --- a/cpp/src/arrow/ipc/test_common.cc +++ b/cpp/src/arrow/ipc/test_common.cc @@ -1236,18 +1236,19 @@ Status MakeRandomTensor(const std::shared_ptr& type, return Tensor::Make(type, buf, shape, strides).Value(out); } -void RoundtripBatch(const std::shared_ptr& batch, - std::shared_ptr* out) { - ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create()); - ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(), - out_stream.get())); +Status RoundtripBatch(const std::shared_ptr& batch, + std::shared_ptr* out) { + ARROW_ASSIGN_OR_RAISE(auto out_stream, 
io::BufferOutputStream::Create()); + RETURN_NOT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(), + out_stream.get())); - ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish()); + ARROW_ASSIGN_OR_RAISE(auto complete_ipc_stream, out_stream->Finish()); io::BufferReader reader(complete_ipc_stream); std::shared_ptr batch_reader; - ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader)); - ASSERT_OK(batch_reader->ReadNext(out)); + ARROW_ASSIGN_OR_RAISE(batch_reader, ipc::RecordBatchStreamReader::Open(&reader)); + RETURN_NOT_OK(batch_reader->ReadNext(out)); + return Status::OK(); } } // namespace test diff --git a/cpp/src/arrow/ipc/test_common.h b/cpp/src/arrow/ipc/test_common.h index 9b7e7f13e3a8e..189de288795c0 100644 --- a/cpp/src/arrow/ipc/test_common.h +++ b/cpp/src/arrow/ipc/test_common.h @@ -184,8 +184,8 @@ Status MakeRandomTensor(const std::shared_ptr& type, const std::vector& shape, bool row_major_p, std::shared_ptr* out, uint32_t seed = 0); -ARROW_TESTING_EXPORT void RoundtripBatch(const std::shared_ptr& batch, - std::shared_ptr* out); +ARROW_TESTING_EXPORT Status RoundtripBatch(const std::shared_ptr& batch, + std::shared_ptr* out); } // namespace test } // namespace ipc diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc index c4a7f363c71bc..07d15826f2c8f 100644 --- a/cpp/src/arrow/testing/gtest_util.cc +++ b/cpp/src/arrow/testing/gtest_util.cc @@ -49,6 +49,7 @@ #include "arrow/buffer.h" #include "arrow/compute/api_vector.h" #include "arrow/datum.h" +#include "arrow/extension/json.h" #include "arrow/io/memory.h" #include "arrow/ipc/json_simple.h" #include "arrow/ipc/reader.h" diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 724e6c44f2ed0..5d990a5c6bd4a 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -37,6 +37,7 @@ #include "arrow/array/builder_primitive.h" #include "arrow/chunked_array.h" #include "arrow/compute/api.h" +#include "arrow/extension/json.h" #include "arrow/io/api.h" #include "arrow/record_batch.h" #include "arrow/scalar.h" @@ -618,10 +619,15 @@ class ParquetIOTestBase : public ::testing::Test { return ParquetFileWriter::Open(sink_, schema); } - void ReaderFromSink(std::unique_ptr* out) { + void ReaderFromSink( + std::unique_ptr* out, + const ArrowReaderProperties& properties = default_arrow_reader_properties()) { ASSERT_OK_AND_ASSIGN(auto buffer, sink_->Finish()); - ASSERT_OK_NO_THROW(OpenFile(std::make_shared(buffer), - ::arrow::default_memory_pool(), out)); + FileReaderBuilder builder; + ASSERT_OK_NO_THROW(builder.Open(std::make_shared(buffer))); + ASSERT_OK_NO_THROW(builder.memory_pool(::arrow::default_memory_pool()) + ->properties(properties) + ->Build(out)); } void ReadSingleColumnFile(std::unique_ptr file_reader, @@ -670,6 +676,7 @@ class ParquetIOTestBase : public ::testing::Test { void RoundTripSingleColumn( const std::shared_ptr& values, const std::shared_ptr& expected, const std::shared_ptr<::parquet::ArrowWriterProperties>& arrow_properties, + const ArrowReaderProperties& reader_properties = default_arrow_reader_properties(), bool nullable = true) { std::shared_ptr table = MakeSimpleTable(values, nullable); this->ResetSink(); @@ -679,7 +686,7 @@ class ParquetIOTestBase : public ::testing::Test { std::shared_ptr
out; std::unique_ptr reader; - ASSERT_NO_FATAL_FAILURE(this->ReaderFromSink(&reader)); + ASSERT_NO_FATAL_FAILURE(this->ReaderFromSink(&reader, reader_properties)); const bool expect_metadata = arrow_properties->store_schema(); ASSERT_NO_FATAL_FAILURE( this->ReadTableFromFile(std::move(reader), expect_metadata, &out)); @@ -1428,6 +1435,52 @@ TEST_F(TestLargeStringParquetIO, Basics) { this->RoundTripSingleColumn(large_array, large_array, arrow_properties); } +using TestJsonParquetIO = TestParquetIO<::arrow::extension::JsonExtensionType>; + +TEST_F(TestJsonParquetIO, JsonExtension) { + const char* json = R"([ + "null", + "1234", + "3.14159", + "true", + "false", + "\"a json string\"", + "[\"a\", \"json\", \"array\"]", + "{\"obj\": \"a simple json object\"}" + ])"; + + const auto json_type = ::arrow::extension::json(); + const auto string_array = ::arrow::ArrayFromJSON(::arrow::utf8(), json); + const auto json_array = ::arrow::ExtensionType::WrapArray(json_type, string_array); + + const auto json_large_type = ::arrow::extension::json(::arrow::large_utf8()); + const auto large_string_array = ::arrow::ArrayFromJSON(::arrow::large_utf8(), json); + const auto json_large_array = + ::arrow::ExtensionType::WrapArray(json_large_type, large_string_array); + + // When the original Arrow schema isn't stored and Arrow extensions are disabled, + // LogicalType::JSON is read as utf8. + this->RoundTripSingleColumn(json_array, string_array, + default_arrow_writer_properties()); + this->RoundTripSingleColumn(json_large_array, string_array, + default_arrow_writer_properties()); + + // When the original Arrow schema isn't stored and Arrow extensions are enabled, + // LogicalType::JSON is read as JsonExtensionType with utf8 storage. + ::parquet::ArrowReaderProperties reader_properties; + reader_properties.set_arrow_extensions_enabled(true); + this->RoundTripSingleColumn(json_array, json_array, default_arrow_writer_properties(), + reader_properties); + this->RoundTripSingleColumn(json_large_array, json_array, + default_arrow_writer_properties(), reader_properties); + + // When the original Arrow schema is stored, the stored Arrow type is respected. 
+ const auto writer_properties = + ::parquet::ArrowWriterProperties::Builder().store_schema()->build(); + this->RoundTripSingleColumn(json_array, json_array, writer_properties); + this->RoundTripSingleColumn(json_large_array, json_large_array, writer_properties); +} + using TestNullParquetIO = TestParquetIO<::arrow::NullType>; TEST_F(TestNullParquetIO, NullColumn) { diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc index 9f60cd31d3541..31ead461aa6e2 100644 --- a/cpp/src/parquet/arrow/arrow_schema_test.cc +++ b/cpp/src/parquet/arrow/arrow_schema_test.cc @@ -31,8 +31,11 @@ #include "parquet/thrift_internal.h" #include "arrow/array.h" +#include "arrow/extension/json.h" +#include "arrow/ipc/writer.h" #include "arrow/testing/gtest_util.h" #include "arrow/type.h" +#include "arrow/util/base64.h" #include "arrow/util/key_value_metadata.h" using arrow::Field; @@ -76,17 +79,17 @@ class TestConvertParquetSchema : public ::testing::Test { auto result_field = result_schema_->field(i); auto expected_field = expected_schema->field(i); EXPECT_TRUE(result_field->Equals(expected_field, check_metadata)) - << "Field " << i << "\n result: " << result_field->ToString() - << "\n expected: " << expected_field->ToString(); + << "Field " << i << "\n result: " << result_field->ToString(check_metadata) + << "\n expected: " << expected_field->ToString(check_metadata); } } ::arrow::Status ConvertSchema( const std::vector& nodes, - const std::shared_ptr& key_value_metadata = nullptr) { + const std::shared_ptr& key_value_metadata = nullptr, + ArrowReaderProperties props = ArrowReaderProperties()) { NodePtr schema = GroupNode::Make("schema", Repetition::REPEATED, nodes); descr_.Init(schema); - ArrowReaderProperties props; return FromParquetSchema(&descr_, props, key_value_metadata, &result_schema_); } @@ -230,7 +233,7 @@ TEST_F(TestConvertParquetSchema, ParquetAnnotatedFields) { ::arrow::uint64()}, {"int(64, true)", LogicalType::Int(64, true), ParquetType::INT64, -1, ::arrow::int64()}, - {"json", LogicalType::JSON(), ParquetType::BYTE_ARRAY, -1, ::arrow::binary()}, + {"json", LogicalType::JSON(), ParquetType::BYTE_ARRAY, -1, ::arrow::utf8()}, {"bson", LogicalType::BSON(), ParquetType::BYTE_ARRAY, -1, ::arrow::binary()}, {"interval", LogicalType::Interval(), ParquetType::FIXED_LEN_BYTE_ARRAY, 12, ::arrow::fixed_size_binary(12)}, @@ -724,6 +727,87 @@ TEST_F(TestConvertParquetSchema, ParquetRepeatedNestedSchema) { ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema)); } +Status ArrowSchemaToParquetMetadata(std::shared_ptr<::arrow::Schema>& arrow_schema, + std::shared_ptr& metadata) { + ARROW_ASSIGN_OR_RAISE( + std::shared_ptr serialized, + ::arrow::ipc::SerializeSchema(*arrow_schema, ::arrow::default_memory_pool())); + std::string schema_as_string = serialized->ToString(); + std::string schema_base64 = ::arrow::util::base64_encode(schema_as_string); + metadata = ::arrow::key_value_metadata({"ARROW:schema"}, {schema_base64}); + return Status::OK(); +} + +TEST_F(TestConvertParquetSchema, ParquetSchemaArrowExtensions) { + std::vector parquet_fields; + parquet_fields.push_back(PrimitiveNode::Make( + "json_1", Repetition::OPTIONAL, ParquetType::BYTE_ARRAY, ConvertedType::JSON)); + parquet_fields.push_back(PrimitiveNode::Make( + "json_2", Repetition::OPTIONAL, ParquetType::BYTE_ARRAY, ConvertedType::JSON)); + + { + // Parquet file does not contain Arrow schema. + // By default, both fields should be treated as utf8() fields in Arrow. 
+ auto arrow_schema = ::arrow::schema( + {::arrow::field("json_1", UTF8, true), ::arrow::field("json_2", UTF8, true)}); + std::shared_ptr metadata{}; + ASSERT_OK(ConvertSchema(parquet_fields, metadata)); + CheckFlatSchema(arrow_schema); + } + + { + // Parquet file does not contain Arrow schema. + // If Arrow extensions are enabled, both fields should be treated as json() extension + // fields. + ArrowReaderProperties props; + props.set_arrow_extensions_enabled(true); + auto arrow_schema = ::arrow::schema( + {::arrow::field("json_1", ::arrow::extension::json(), true), + ::arrow::field("json_2", ::arrow::extension::json(::arrow::large_utf8()), + true)}); + std::shared_ptr metadata{}; + ASSERT_OK(ConvertSchema(parquet_fields, metadata, props)); + CheckFlatSchema(arrow_schema); + } + + { + // Parquet file contains Arrow schema. + // Both json_1 and json_2 should be returned as a json() field + // even though extensions are not enabled. + ArrowReaderProperties props; + props.set_arrow_extensions_enabled(false); + std::shared_ptr field_metadata = + ::arrow::key_value_metadata({"foo", "bar"}, {"biz", "baz"}); + auto arrow_schema = ::arrow::schema( + {::arrow::field("json_1", ::arrow::extension::json(), true, field_metadata), + ::arrow::field("json_2", ::arrow::extension::json(::arrow::large_utf8()), + true)}); + + std::shared_ptr metadata; + ASSERT_OK(ArrowSchemaToParquetMetadata(arrow_schema, metadata)); + ASSERT_OK(ConvertSchema(parquet_fields, metadata, props)); + CheckFlatSchema(arrow_schema, true /* check_metadata */); + } + + { + // Parquet file contains Arrow schema. Extensions are enabled. + // Both json_1 and json_2 should be returned as a json() field + ArrowReaderProperties props; + props.set_arrow_extensions_enabled(true); + std::shared_ptr field_metadata = + ::arrow::key_value_metadata({"foo", "bar"}, {"biz", "baz"}); + auto arrow_schema = ::arrow::schema( + {::arrow::field("json_1", ::arrow::extension::json(), true, field_metadata), + ::arrow::field("json_2", ::arrow::extension::json(::arrow::large_utf8()), + true)}); + + std::shared_ptr metadata; + ASSERT_OK(ArrowSchemaToParquetMetadata(arrow_schema, metadata)); + ASSERT_OK(ConvertSchema(parquet_fields, metadata, props)); + CheckFlatSchema(arrow_schema, true /* check_metadata */); + } +} + class TestConvertArrowSchema : public ::testing::Test { public: virtual void SetUp() {} diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index ec3890a41f442..1623d80dcb0e4 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -21,6 +21,7 @@ #include #include +#include "arrow/extension/json.h" #include "arrow/extension_type.h" #include "arrow/io/memory.h" #include "arrow/ipc/api.h" @@ -427,6 +428,13 @@ Status FieldToNode(const std::string& name, const std::shared_ptr& field, } case ArrowTypeId::EXTENSION: { auto ext_type = std::static_pointer_cast<::arrow::ExtensionType>(field->type()); + // Built-in JSON extension is handled differently. + if (ext_type->extension_name() == std::string("arrow.json")) { + // Set physical and logical types and instantiate primitive node. 
+ type = ParquetType::BYTE_ARRAY; + logical_type = LogicalType::JSON(); + break; + } std::shared_ptr<::arrow::Field> storage_field = ::arrow::field( name, ext_type->storage_type(), field->nullable(), field->metadata()); return FieldToNode(name, storage_field, properties, arrow_properties, out); @@ -438,7 +446,7 @@ Status FieldToNode(const std::string& name, const std::shared_ptr& field, } default: { - // TODO: DENSE_UNION, SPARE_UNION, JSON_SCALAR, DECIMAL_TEXT, VARCHAR + // TODO: DENSE_UNION, SPARE_UNION, DECIMAL_TEXT, VARCHAR return Status::NotImplemented( "Unhandled type for Arrow to Parquet schema conversion: ", field->type()->ToString()); @@ -476,9 +484,8 @@ bool IsDictionaryReadSupported(const ArrowType& type) { ::arrow::Result> GetTypeForNode( int column_index, const schema::PrimitiveNode& primitive_node, SchemaTreeContext* ctx) { - ASSIGN_OR_RAISE( - std::shared_ptr storage_type, - GetArrowType(primitive_node, ctx->properties.coerce_int96_timestamp_unit())); + ASSIGN_OR_RAISE(std::shared_ptr storage_type, + GetArrowType(primitive_node, ctx->properties)); if (ctx->properties.read_dictionary(column_index) && IsDictionaryReadSupported(*storage_type)) { return ::arrow::dictionary(::arrow::int32(), storage_type); @@ -984,18 +991,35 @@ Result ApplyOriginalMetadata(const Field& origin_field, SchemaField* infer bool modified = false; auto& origin_type = origin_field.type(); + const auto& inferred_type = inferred->field->type(); if (origin_type->id() == ::arrow::Type::EXTENSION) { const auto& ex_type = checked_cast(*origin_type); - auto origin_storage_field = origin_field.WithType(ex_type.storage_type()); + if (inferred_type->id() != ::arrow::Type::EXTENSION && + ex_type.extension_name() == std::string("arrow.json") && + (inferred_type->id() == ::arrow::Type::STRING || + inferred_type->id() == ::arrow::Type::LARGE_STRING || + inferred_type->id() == ::arrow::Type::STRING_VIEW)) { + // Schema mismatch. + // + // Arrow extensions are DISABLED in Parquet. + // origin_type is ::arrow::extension::json() + // inferred_type is ::arrow::utf8() + // + // Origin type is restored as Arrow should be considered the source of truth. 
+ inferred->field = inferred->field->WithType(origin_type); + RETURN_NOT_OK(ApplyOriginalStorageMetadata(origin_field, inferred)); + } else { + auto origin_storage_field = origin_field.WithType(ex_type.storage_type()); - // Apply metadata recursively to storage type - RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, inferred)); + // Apply metadata recursively to storage type + RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, inferred)); - // Restore extension type, if the storage type is the same as inferred - // from the Parquet type - if (ex_type.storage_type()->Equals(*inferred->field->type())) { - inferred->field = inferred->field->WithType(origin_type); + // Restore extension type, if the storage type is the same as inferred + // from the Parquet type + if (ex_type.storage_type()->Equals(*inferred->field->type())) { + inferred->field = inferred->field->WithType(origin_type); + } } modified = true; } else { diff --git a/cpp/src/parquet/arrow/schema_internal.cc b/cpp/src/parquet/arrow/schema_internal.cc index a8e2a95b9b97d..261a00940654d 100644 --- a/cpp/src/parquet/arrow/schema_internal.cc +++ b/cpp/src/parquet/arrow/schema_internal.cc @@ -17,8 +17,11 @@ #include "parquet/arrow/schema_internal.h" +#include "arrow/extension/json.h" #include "arrow/type.h" +#include "parquet/properties.h" + using ArrowType = ::arrow::DataType; using ArrowTypeId = ::arrow::Type; using ParquetType = parquet::Type; @@ -107,7 +110,8 @@ Result> MakeArrowTimestamp(const LogicalType& logical } } -Result> FromByteArray(const LogicalType& logical_type) { +Result> FromByteArray( + const LogicalType& logical_type, const ArrowReaderProperties& reader_properties) { switch (logical_type.type()) { case LogicalType::Type::STRING: return ::arrow::utf8(); @@ -115,9 +119,15 @@ Result> FromByteArray(const LogicalType& logical_type return MakeArrowDecimal(logical_type); case LogicalType::Type::NONE: case LogicalType::Type::ENUM: - case LogicalType::Type::JSON: case LogicalType::Type::BSON: return ::arrow::binary(); + case LogicalType::Type::JSON: + if (reader_properties.get_arrow_extensions_enabled()) { + return ::arrow::extension::json(::arrow::utf8()); + } + // When the original Arrow schema isn't stored and Arrow extensions are disabled, + // LogicalType::JSON is read as utf8(). 
+ return ::arrow::utf8(); default: return Status::NotImplemented("Unhandled logical logical_type ", logical_type.ToString(), " for binary array"); @@ -180,7 +190,7 @@ Result> FromInt64(const LogicalType& logical_type) { Result> GetArrowType( Type::type physical_type, const LogicalType& logical_type, int type_length, - const ::arrow::TimeUnit::type int96_arrow_time_unit) { + const ArrowReaderProperties& reader_properties) { if (logical_type.is_invalid() || logical_type.is_null()) { return ::arrow::null(); } @@ -193,13 +203,13 @@ Result> GetArrowType( case ParquetType::INT64: return FromInt64(logical_type); case ParquetType::INT96: - return ::arrow::timestamp(int96_arrow_time_unit); + return ::arrow::timestamp(reader_properties.coerce_int96_timestamp_unit()); case ParquetType::FLOAT: return ::arrow::float32(); case ParquetType::DOUBLE: return ::arrow::float64(); case ParquetType::BYTE_ARRAY: - return FromByteArray(logical_type); + return FromByteArray(logical_type, reader_properties); case ParquetType::FIXED_LEN_BYTE_ARRAY: return FromFLBA(logical_type, type_length); default: { @@ -212,9 +222,9 @@ Result> GetArrowType( Result> GetArrowType( const schema::PrimitiveNode& primitive, - const ::arrow::TimeUnit::type int96_arrow_time_unit) { + const ArrowReaderProperties& reader_properties) { return GetArrowType(primitive.physical_type(), *primitive.logical_type(), - primitive.type_length(), int96_arrow_time_unit); + primitive.type_length(), reader_properties); } } // namespace parquet::arrow diff --git a/cpp/src/parquet/arrow/schema_internal.h b/cpp/src/parquet/arrow/schema_internal.h index f56ba0958ae2d..58828f85ab8e3 100644 --- a/cpp/src/parquet/arrow/schema_internal.h +++ b/cpp/src/parquet/arrow/schema_internal.h @@ -18,6 +18,7 @@ #pragma once #include "arrow/result.h" +#include "arrow/type_fwd.h" #include "parquet/schema.h" namespace arrow { @@ -28,7 +29,8 @@ namespace parquet::arrow { using ::arrow::Result; -Result> FromByteArray(const LogicalType& logical_type); +Result> FromByteArray(const LogicalType& logical_type, + bool use_known_arrow_extensions); Result> FromFLBA(const LogicalType& logical_type, int32_t physical_length); Result> FromInt32(const LogicalType& logical_type); @@ -36,10 +38,10 @@ Result> FromInt64(const LogicalType& logical_ Result> GetArrowType( Type::type physical_type, const LogicalType& logical_type, int type_length, - ::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO); + const ArrowReaderProperties& reader_properties); Result> GetArrowType( const schema::PrimitiveNode& primitive, - ::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO); + const ArrowReaderProperties& reader_properties); } // namespace parquet::arrow diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 4d3acb491e390..7f2e371df66d7 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -870,7 +870,8 @@ class PARQUET_EXPORT ArrowReaderProperties { batch_size_(kArrowDefaultBatchSize), pre_buffer_(true), cache_options_(::arrow::io::CacheOptions::LazyDefaults()), - coerce_int96_timestamp_unit_(::arrow::TimeUnit::NANO) {} + coerce_int96_timestamp_unit_(::arrow::TimeUnit::NANO), + arrow_extensions_enabled_(false) {} /// \brief Set whether to use the IO thread pool to parse columns in parallel. /// @@ -941,6 +942,18 @@ class PARQUET_EXPORT ArrowReaderProperties { return coerce_int96_timestamp_unit_; } + /// Enable Parquet-supported Arrow extension types. 
+ /// + /// When enabled, Parquet logical types will be mapped to their corresponding Arrow + /// extension types at read time, if such exist. Currently only arrow::extension::json() + /// extension type is supported. Columns whose LogicalType is JSON will be interpreted + /// as arrow::extension::json(), with storage type inferred from the serialized Arrow + /// schema if present, or `utf8` by default. + void set_arrow_extensions_enabled(bool extensions_enabled) { + arrow_extensions_enabled_ = extensions_enabled; + } + bool get_arrow_extensions_enabled() const { return arrow_extensions_enabled_; } + private: bool use_threads_; std::unordered_set read_dict_indices_; @@ -949,6 +962,7 @@ class PARQUET_EXPORT ArrowReaderProperties { ::arrow::io::IOContext io_context_; ::arrow::io::CacheOptions cache_options_; ::arrow::TimeUnit::type coerce_int96_timestamp_unit_; + bool arrow_extensions_enabled_; }; /// EXPERIMENTAL: Constructs the default ArrowReaderProperties diff --git a/docs/source/status.rst b/docs/source/status.rst index b685d4bbf8add..98374164d7ae0 100644 --- a/docs/source/status.rst +++ b/docs/source/status.rst @@ -119,7 +119,7 @@ Data Types +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Variable shape tensor | | | | | | | | | +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| JSON | | | ✓ | | | | | | +| JSON | ✓ | | ✓ | | | | | | +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | UUID | ✓ | | ✓ | | | | | | +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ From 89c08a4c7395571b3e879345b96a75ae8b7b5f63 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 11 Sep 2024 19:32:04 +0200 Subject: [PATCH 117/130] GH-36412: [Python][CI] Fix deprecation warning about day freq alias with latest pandas (#44067) ### Rationale for this change Updating our pandas usage to follow pandas' changes (they are deprecating the `"d"` alias as alternative for `"D"`) * GitHub Issue: #36412 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/tests/test_compute.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index d4307cd24f8fc..c16d2f9aacf74 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -2417,7 +2417,7 @@ def _check_temporal_rounding(ts, values, unit): "millisecond": "s", "second": "min", "minute": "h", - "hour": "d", + "hour": "D", } ta = pa.array(ts) From 7c6c42d2cf0132aac890410c953127dd38373c79 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 12 Sep 2024 09:34:46 +0900 Subject: [PATCH 118/130] MINOR: [Java] Bump com.gradle:common-custom-user-data-maven-extension from 2.0 to 2.0.1 in /java (#44024) Bumps [com.gradle:common-custom-user-data-maven-extension](https://github.com/gradle/common-custom-user-data-maven-extension) from 2.0 to 2.0.1.
Release notes

Sourced from com.gradle:common-custom-user-data-maven-extension's releases.

2.0.1

  • [NEW] JAR contains LICENSE and NOTICE files

Commits
  • 7635e5c [maven-release-plugin] prepare release v2.0.1
  • 58eaf15 Create release notes for 2.0.1
  • 40395ee Merge pull request #241 from gradle/erichaagdev/license-notice-2
  • ba1de90 Ensure this project's LICENSE and NOTICE are the only ones included in JARs
  • 4538056 Add NOTICE file and include it and LICENSE in all assembled JAR files
  • 9889c7a Revert "Add NOTICE file and include it and LICENSE in assembled JAR"
  • 43e1784 Merge pull request #240 from gradle/erichaagdev/license-notice
  • 7dade91 Add NOTICE file and include it and LICENSE in assembled JAR
  • 6e251e3 Merge pull request #239 from gradle/wrapperbot/common-custom-user-data-maven-...
  • 3ba34bd Bump Maven Wrapper from 3.9.8 to 3.9.9
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=com.gradle:common-custom-user-data-maven-extension&package-manager=maven&previous-version=2.0&new-version=2.0.1)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: David Li --- java/.mvn/extensions.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/.mvn/extensions.xml b/java/.mvn/extensions.xml index 716e2f9e81c35..c90629a91c9ec 100644 --- a/java/.mvn/extensions.xml +++ b/java/.mvn/extensions.xml @@ -28,6 +28,6 @@ com.gradle common-custom-user-data-maven-extension - 2.0 + 2.0.1 From 837a3e2ee97e12333d325f23a0464c5f36d9f572 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Wed, 11 Sep 2024 21:13:55 -0400 Subject: [PATCH 119/130] GH-43748: [R] Handle package_version in safe_r_metadata (#43895) ### Rationale for this change See #43748. There is what appears to be a bug in R's `[[.numeric_version` implementation that leads to infinite recursion. Edit: after some digging in R source, this appears to be as designed. And other list subclasses that have methods to make them behave like atomic types, like `POSIXlt`, also have this. ### What changes are included in this PR? When recursing into list objects, `unclass()` them first to get the raw list behavior. Also apply the checking to the `attributes()` before reapplying them. ### Are these changes tested? yes ### Are there any user-facing changes? Fewer bugs! * GitHub Issue: #43748 --- r/R/metadata.R | 21 ++++++++++++++++++++- r/tests/testthat/test-metadata.R | 29 ++++++++++++++++++++++------- 2 files changed, 42 insertions(+), 8 deletions(-) diff --git a/r/R/metadata.R b/r/R/metadata.R index ba73f0857881d..61e412be62450 100644 --- a/r/R/metadata.R +++ b/r/R/metadata.R @@ -107,15 +107,34 @@ safe_r_metadata <- function(metadata, on_save = FALSE) { # and mutate the `types_removed` variable outside of it. check_r_metadata_types_recursive <- function(x) { allowed_types <- c("character", "double", "integer", "logical", "complex", "list", "NULL") + # Pull out the attributes so we can also check them + x_attrs <- attributes(x) + if (is.list(x)) { + # Add special handling for some base R classes that are list but + # their [[ methods leads to infinite recursion. + # We unclass here and then reapply attributes after. + x <- unclass(x) + types <- map_chr(x, typeof) - x[types == "list"] <- map(x[types == "list"], check_r_metadata_types_recursive) ok <- types %in% allowed_types if (!all(ok)) { # Record the invalid types, then remove the offending elements types_removed <<- c(types_removed, setdiff(types, allowed_types)) x <- x[ok] + if ("names" %in% names(x_attrs)) { + # Also prune from the attributes since we'll re-add later + x_attrs[["names"]] <- x_attrs[["names"]][ok] + } } + # For the rest, recurse + x <- map(x, check_r_metadata_types_recursive) + } + + # attributes() of a named list will return a list with a "names" attribute, + # so it will recurse indefinitely. 
+ if (!is.null(x_attrs) && !identical(x_attrs, list(names = names(x)))) { + attributes(x) <- check_r_metadata_types_recursive(x_attrs) } x } diff --git a/r/tests/testthat/test-metadata.R b/r/tests/testthat/test-metadata.R index 175e7ef3b6b73..06aa1535e0a36 100644 --- a/r/tests/testthat/test-metadata.R +++ b/r/tests/testthat/test-metadata.R @@ -149,6 +149,15 @@ arbitrary\040code\040was\040just\040executed ) }) +test_that("R metadata processing doesn't choke on packageVersion() output", { + metadata <- list(version = packageVersion("base")) + expect_identical(safe_r_metadata(metadata), metadata) + + df <- example_data[1:6] + attr(df, "version") <- packageVersion("base") + expect_equal_data_frame(Table$create(df), df) +}) + test_that("Complex or unsafe attributes are pruned from R metadata, if they exist", { tab <- Table$create(example_data[1:6]) bad <- new.env() @@ -161,18 +170,24 @@ i Type: \"environment\" > If you trust the source, you can set `options(arrow.unsafe_metadata = TRUE)` to preserve them.", fixed = TRUE ) + # Try hiding it even further, in attributes + bad_meta <- list(attributes = structure(list(), hidden_attr = bad)) + tab$metadata <- list(r = rawToChar(serialize(bad_meta, NULL, ascii = TRUE))) + expect_warning( + as.data.frame(tab), + "Potentially unsafe or invalid elements have been discarded from R metadata. +i Type: \"environment\" +> If you trust the source, you can set `options(arrow.unsafe_metadata = TRUE)` to preserve them.", + fixed = TRUE + ) + # You can set an option to allow them through. # It still warns, just differently, and it doesn't prune the attributes withr::local_options(list("arrow.unsafe_metadata" = TRUE)) expect_warning( - expect_warning( - as.data.frame(tab), - "R metadata may have unsafe or invalid elements + as.data.frame(tab), + "R metadata may have unsafe or invalid elements i Type: \"environment\"" - ), - # This particular example ultimately fails because it's not a list - "Invalid metadata$r", - fixed = TRUE ) }) From 0f9ed849fe54b8ce72f31b889b102b9db205a571 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 12 Sep 2024 08:37:46 +0200 Subject: [PATCH 120/130] GH-44063: [Python] Deprecate the no longer used serialize/deserialize Pyarrow C++ functions (#44064) ### Rationale for this change We want to remove this part of the code (since we no longer use it ourselves, see https://github.com/apache/arrow/issues/43587), and before doing that first deprecating them for two releases. * GitHub Issue: #44063 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/src/arrow/python/deserialize.h | 6 ++++++ python/pyarrow/src/arrow/python/serialize.h | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/python/pyarrow/src/arrow/python/deserialize.h b/python/pyarrow/src/arrow/python/deserialize.h index 41b6a13a38875..fe1d73622a3db 100644 --- a/python/pyarrow/src/arrow/python/deserialize.h +++ b/python/pyarrow/src/arrow/python/deserialize.h @@ -24,6 +24,7 @@ #include "arrow/python/serialize.h" #include "arrow/python/visibility.h" #include "arrow/status.h" +#include "arrow/util/macros.h" namespace arrow { @@ -55,6 +56,7 @@ struct ARROW_PYTHON_EXPORT SparseTensorCounts { /// \param[in] src a RandomAccessFile /// \param[out] out the reconstructed data /// \return Status +ARROW_DEPRECATED("Deprecated in 18.0.0. 
Will be removed in 20.0.0") ARROW_PYTHON_EXPORT Status ReadSerializedObject(io::RandomAccessFile* src, SerializedPyObject* out); @@ -70,6 +72,7 @@ Status ReadSerializedObject(io::RandomAccessFile* src, SerializedPyObject* out); /// num_csf_tensors * (2 * ndim_csf + 3) + num_buffers in length /// \param[out] out the reconstructed object /// \return Status +ARROW_DEPRECATED("Deprecated in 18.0.0. Will be removed in 20.0.0") ARROW_PYTHON_EXPORT Status GetSerializedFromComponents(int num_tensors, const SparseTensorCounts& num_sparse_tensors, @@ -88,6 +91,7 @@ Status GetSerializedFromComponents(int num_tensors, /// \param[out] out The returned object /// \return Status /// This acquires the GIL +ARROW_DEPRECATED("Deprecated in 18.0.0. Will be removed in 20.0.0") ARROW_PYTHON_EXPORT Status DeserializeObject(PyObject* context, const SerializedPyObject& object, PyObject* base, PyObject** out); @@ -96,9 +100,11 @@ Status DeserializeObject(PyObject* context, const SerializedPyObject& object, /// \param[in] object Object to deserialize /// \param[out] out The deserialized tensor /// \return Status +ARROW_DEPRECATED("Deprecated in 18.0.0. Will be removed in 20.0.0") ARROW_PYTHON_EXPORT Status DeserializeNdarray(const SerializedPyObject& object, std::shared_ptr* out); +ARROW_DEPRECATED("Deprecated in 18.0.0. Will be removed in 20.0.0") ARROW_PYTHON_EXPORT Status NdarrayFromBuffer(std::shared_ptr src, std::shared_ptr* out); diff --git a/python/pyarrow/src/arrow/python/serialize.h b/python/pyarrow/src/arrow/python/serialize.h index fd207d3e06903..af6d2d81a61c4 100644 --- a/python/pyarrow/src/arrow/python/serialize.h +++ b/python/pyarrow/src/arrow/python/serialize.h @@ -24,6 +24,7 @@ #include "arrow/python/visibility.h" #include "arrow/sparse_tensor.h" #include "arrow/status.h" +#include "arrow/util/macros.h" // Forward declaring PyObject, see // https://mail.python.org/pipermail/python-dev/2003-August/037601.html @@ -92,6 +93,7 @@ struct ARROW_PYTHON_EXPORT SerializedPyObject { /// \return Status /// /// Release GIL before calling +ARROW_DEPRECATED("Deprecated in 18.0.0. Will be removed in 20.0.0") ARROW_PYTHON_EXPORT Status SerializeObject(PyObject* context, PyObject* sequence, SerializedPyObject* out); @@ -99,6 +101,7 @@ Status SerializeObject(PyObject* context, PyObject* sequence, SerializedPyObject /// \param[in] tensor Tensor to be serialized /// \param[out] out The serialized representation /// \return Status +ARROW_DEPRECATED("Deprecated in 18.0.0. Will be removed in 20.0.0") ARROW_PYTHON_EXPORT Status SerializeTensor(std::shared_ptr tensor, py::SerializedPyObject* out); @@ -108,6 +111,7 @@ Status SerializeTensor(std::shared_ptr tensor, py::SerializedPyObject* o /// \param[in] tensor_num_bytes The length of the Tensor data in bytes /// \param[in] dst The OutputStream to write the Tensor header to /// \return Status +ARROW_DEPRECATED("Deprecated in 18.0.0. Will be removed in 20.0.0") ARROW_PYTHON_EXPORT Status WriteNdarrayHeader(std::shared_ptr dtype, const std::vector& shape, int64_t tensor_num_bytes, From 002b301aca5b155e111d2958653669ff67d2b205 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 12 Sep 2024 10:42:22 +0200 Subject: [PATCH 121/130] GH-44072: [C++][Parquet] Add Float16 reading benchmarks (#44073) Local benchmark numbers: ``` --------------------------------------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations UserCounters... 
--------------------------------------------------------------------------------------------------------------------------- BM_ReadColumnPlain/null_probability:-1 20038480 ns 20019703 ns 36 bytes_per_second=1.9512Gi/s items_per_second=523.772M/s BM_ReadColumnPlain/null_probability:0 37114403 ns 36766588 ns 19 bytes_per_second=1.06245Gi/s items_per_second=285.198M/s BM_ReadColumnPlain/null_probability:1 44589582 ns 44371707 ns 16 bytes_per_second=901.475Mi/s items_per_second=236.316M/s BM_ReadColumnPlain/null_probability:50 65624754 ns 65322683 ns 11 bytes_per_second=612.345Mi/s items_per_second=160.522M/s BM_ReadColumnPlain/null_probability:99 43072631 ns 42932582 ns 16 bytes_per_second=931.693Mi/s items_per_second=244.238M/s BM_ReadColumnPlain/null_probability:100 36710045 ns 36475141 ns 19 bytes_per_second=1.07093Gi/s items_per_second=287.477M/s BM_ReadColumnPlain/null_probability:-1 52718868 ns 52616204 ns 12 bytes_per_second=380.111Mi/s items_per_second=199.288M/s BM_ReadColumnPlain/null_probability:0 71273144 ns 71093105 ns 10 bytes_per_second=281.321Mi/s items_per_second=147.493M/s BM_ReadColumnPlain/null_probability:1 80674727 ns 80358048 ns 8 bytes_per_second=248.886Mi/s items_per_second=130.488M/s BM_ReadColumnPlain/null_probability:50 138249159 ns 137922632 ns 5 bytes_per_second=145.009Mi/s items_per_second=76.0264M/s BM_ReadColumnPlain/null_probability:99 86938382 ns 86576176 ns 8 bytes_per_second=231.01Mi/s items_per_second=121.116M/s BM_ReadColumnPlain/null_probability:100 74154244 ns 73984356 ns 9 bytes_per_second=270.327Mi/s items_per_second=141.729M/s ``` * GitHub Issue: #44072 Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- .../parquet/arrow/reader_writer_benchmark.cc | 86 ++++++++++++++++--- 1 file changed, 75 insertions(+), 11 deletions(-) diff --git a/cpp/src/parquet/arrow/reader_writer_benchmark.cc b/cpp/src/parquet/arrow/reader_writer_benchmark.cc index 95c4a659297d9..b12f234f72bdf 100644 --- a/cpp/src/parquet/arrow/reader_writer_benchmark.cc +++ b/cpp/src/parquet/arrow/reader_writer_benchmark.cc @@ -28,6 +28,7 @@ #include "parquet/file_reader.h" #include "parquet/file_writer.h" #include "parquet/platform.h" +#include "parquet/properties.h" #include "arrow/array.h" #include "arrow/array/builder_primitive.h" @@ -88,6 +89,11 @@ struct benchmark_traits { using arrow_type = ::arrow::BooleanType; }; +template <> +struct benchmark_traits { + using arrow_type = ::arrow::HalfFloatType; +}; + template using ArrowType = typename benchmark_traits::arrow_type; @@ -125,15 +131,15 @@ std::vector RandomVector(int64_t true_percentage, int64_t vector_size, return values; } -template +template > std::shared_ptr<::arrow::Table> TableFromVector( - const std::vector& vec, bool nullable, + const std::vector& vec, bool nullable, int64_t null_percentage = kAlternatingOrNa) { if (!nullable) { ARROW_CHECK_EQ(null_percentage, kAlternatingOrNa); } - std::shared_ptr<::arrow::DataType> type = std::make_shared>(); - NumericBuilder> builder; + std::shared_ptr<::arrow::DataType> type = std::make_shared(); + NumericBuilder builder; if (nullable) { // Note true values select index 1 of sample_values auto valid_bytes = RandomVector(/*true_percentage=*/null_percentage, @@ -258,18 +264,20 @@ struct Examples { }; static void BenchmarkReadTable(::benchmark::State& state, const ::arrow::Table& table, + std::shared_ptr properties, int64_t num_values = -1, int64_t total_bytes = -1) { auto output = CreateOutputStream(); - EXIT_NOT_OK( - WriteTable(table, ::arrow::default_memory_pool(), output, 
table.num_rows())); + EXIT_NOT_OK(WriteTable(table, ::arrow::default_memory_pool(), output, + /*chunk_size=*/table.num_rows(), properties)); PARQUET_ASSIGN_OR_THROW(auto buffer, output->Finish()); - while (state.KeepRunning()) { + for (auto _ : state) { auto reader = ParquetFileReader::Open(std::make_shared<::arrow::io::BufferReader>(buffer)); std::unique_ptr arrow_reader; EXIT_NOT_OK(FileReader::Make(::arrow::default_memory_pool(), std::move(reader), &arrow_reader)); + std::shared_ptr<::arrow::Table> table; EXIT_NOT_OK(arrow_reader->ReadTable(&table)); } @@ -283,8 +291,14 @@ static void BenchmarkReadTable(::benchmark::State& state, const ::arrow::Table& } } +static void BenchmarkReadTable(::benchmark::State& state, const ::arrow::Table& table, + int64_t num_values = -1, int64_t total_bytes = -1) { + BenchmarkReadTable(state, table, default_writer_properties(), num_values, total_bytes); +} + static void BenchmarkReadArray(::benchmark::State& state, const std::shared_ptr& array, bool nullable, + std::shared_ptr properties, int64_t num_values = -1, int64_t total_bytes = -1) { auto schema = ::arrow::schema({field("s", array->type(), nullable)}); auto table = ::arrow::Table::Make(schema, {array}, array->length()); @@ -294,8 +308,15 @@ static void BenchmarkReadArray(::benchmark::State& state, BenchmarkReadTable(state, *table, num_values, total_bytes); } +static void BenchmarkReadArray(::benchmark::State& state, + const std::shared_ptr& array, bool nullable, + int64_t num_values = -1, int64_t total_bytes = -1) { + BenchmarkReadArray(state, array, nullable, default_writer_properties(), num_values, + total_bytes); +} + // -// Benchmark reading a primitive column +// Benchmark reading a dict-encoded primitive column // template @@ -308,7 +329,9 @@ static void BM_ReadColumn(::benchmark::State& state) { std::shared_ptr<::arrow::Table> table = TableFromVector(values, nullable, state.range(0)); - BenchmarkReadTable(state, *table, table->num_rows(), + auto properties = WriterProperties::Builder().disable_dictionary()->build(); + + BenchmarkReadTable(state, *table, properties, table->num_rows(), sizeof(typename ParquetType::c_type) * table->num_rows()); } @@ -316,8 +339,9 @@ static void BM_ReadColumn(::benchmark::State& state) { // null_percentage governs distribution and therefore runs of null values. // first_value_percentage governs distribution of values (we select from 1 of 2) // so when 0 or 100 RLE is triggered all the time. When a value in the range (0, 100) -// there will be some percentage of RLE encoded values and some percentage of literal -// encoded values (RLE is much less likely with percentages close to 50). +// there will be some percentage of RLE-encoded dictionary indices and some +// percentage of literal encoded dictionary indices +// (RLE is much less likely with percentages close to 50). 
BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int32Type) ->Args({/*null_percentage=*/kAlternatingOrNa, 1}) ->Args({/*null_percentage=*/kAlternatingOrNa, 10}) @@ -325,6 +349,7 @@ BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int32Type) BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int32Type) ->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/0}) + ->Args({/*null_percentage=*/0, /*first_value_percentage=*/1}) ->Args({/*null_percentage=*/1, /*first_value_percentage=*/1}) ->Args({/*null_percentage=*/10, /*first_value_percentage=*/10}) ->Args({/*null_percentage=*/25, /*first_value_percentage=*/5}) @@ -369,6 +394,45 @@ BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType) ->Args({kAlternatingOrNa, 1}) ->Args({5, 10}); +// +// Benchmark reading a PLAIN-encoded primitive column +// + +template +static void BM_ReadColumnPlain(::benchmark::State& state) { + using c_type = typename ArrowType::c_type; + + const std::vector values(BENCHMARK_SIZE, static_cast(42)); + std::shared_ptr<::arrow::Table> table = + TableFromVector(values, /*nullable=*/nullable, state.range(0)); + + auto properties = WriterProperties::Builder().disable_dictionary()->build(); + BenchmarkReadTable(state, *table, properties, table->num_rows(), + sizeof(c_type) * table->num_rows()); +} + +BENCHMARK_TEMPLATE2(BM_ReadColumnPlain, false, Int32Type) + ->ArgNames({"null_probability"}) + ->Args({kAlternatingOrNa}); +BENCHMARK_TEMPLATE2(BM_ReadColumnPlain, true, Int32Type) + ->ArgNames({"null_probability"}) + ->Args({0}) + ->Args({1}) + ->Args({50}) + ->Args({99}) + ->Args({100}); + +BENCHMARK_TEMPLATE2(BM_ReadColumnPlain, false, Float16LogicalType) + ->ArgNames({"null_probability"}) + ->Args({kAlternatingOrNa}); +BENCHMARK_TEMPLATE2(BM_ReadColumnPlain, true, Float16LogicalType) + ->ArgNames({"null_probability"}) + ->Args({0}) + ->Args({1}) + ->Args({50}) + ->Args({99}) + ->Args({100}); + // // Benchmark reading binary column // From a76ab32a0b93d3ddc401a54cde7491556e39e143 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 12 Sep 2024 16:29:38 +0200 Subject: [PATCH 122/130] GH-44081: [C++][Parquet] Fix reported metrics in parquet-arrow-reader-writer-benchmark (#44082) ### Rationale for this change 1. items/sec and bytes/sec were set to the same value in some benchmarks 2. bytes/sec was incorrectly computed for boolean columns ### What changes are included in this PR? Fix parquet-arrow-reader-writer-benchmark to report correct metrics. #### Example (column writing) Before: ``` -------------------------------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations UserCounters... 
-------------------------------------------------------------------------------------------------------------------- BM_WriteColumn 43138428 ns 43118609 ns 15 bytes_per_second=927.674Mi/s items_per_second=972.736M/s BM_WriteColumn 150528627 ns 150480597 ns 5 bytes_per_second=265.815Mi/s items_per_second=278.727M/s BM_WriteColumn 49243514 ns 49214955 ns 14 bytes_per_second=1.58742Gi/s items_per_second=1.70448G/s BM_WriteColumn 151526550 ns 151472832 ns 5 bytes_per_second=528.148Mi/s items_per_second=553.803M/s BM_WriteColumn 59101372 ns 59068058 ns 12 bytes_per_second=1.32263Gi/s items_per_second=1.42016G/s BM_WriteColumn 159944872 ns 159895095 ns 4 bytes_per_second=500.328Mi/s items_per_second=524.632M/s BM_WriteColumn 32855604 ns 32845322 ns 21 bytes_per_second=304.457Mi/s items_per_second=319.247M/s BM_WriteColumn 150566118 ns 150528329 ns 5 bytes_per_second=66.4327Mi/s items_per_second=69.6597M/s ``` After: ``` Benchmark Time CPU Iterations UserCounters... -------------------------------------------------------------------------------------------------------------------- BM_WriteColumn 43919180 ns 43895926 ns 16 bytes_per_second=911.246Mi/s items_per_second=238.878M/s BM_WriteColumn 153981290 ns 153929841 ns 5 bytes_per_second=259.859Mi/s items_per_second=68.1204M/s BM_WriteColumn 49906105 ns 49860098 ns 14 bytes_per_second=1.56688Gi/s items_per_second=210.304M/s BM_WriteColumn 154273499 ns 154202319 ns 5 bytes_per_second=518.799Mi/s items_per_second=68M/s BM_WriteColumn 59789490 ns 59733498 ns 12 bytes_per_second=1.30789Gi/s items_per_second=175.542M/s BM_WriteColumn 161235860 ns 161169670 ns 4 bytes_per_second=496.371Mi/s items_per_second=65.0604M/s BM_WriteColumn 32962097 ns 32950864 ns 21 bytes_per_second=37.9353Mi/s items_per_second=318.224M/s BM_WriteColumn 154103499 ns 154052873 ns 5 bytes_per_second=8.1141Mi/s items_per_second=68.066M/s ``` #### Example (column reading) Before: ``` --------------------------------------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations UserCounters... --------------------------------------------------------------------------------------------------------------------------- BM_ReadColumn/-1/0 6456731 ns 6453510 ns 108 bytes_per_second=1.51323Gi/s items_per_second=1.62482G/s BM_ReadColumn/1/20 19012505 ns 19006068 ns 36 bytes_per_second=526.148Mi/s items_per_second=551.706M/s BM_ReadColumn/-1/1 58365426 ns 58251529 ns 12 bytes_per_second=171.669Mi/s items_per_second=180.008M/s BM_ReadColumn/5/10 46498966 ns 46442191 ns 15 bytes_per_second=215.321Mi/s items_per_second=225.781M/s BM_ReadIndividualRowGroups 29617575 ns 29600557 ns 24 bytes_per_second=2.63931Gi/s items_per_second=2.83394G/s BM_ReadMultipleRowGroups 47416980 ns 47288951 ns 15 bytes_per_second=1.65208Gi/s items_per_second=1.7739G/s BM_ReadMultipleRowGroupsGenerator 29741012 ns 29722112 ns 24 bytes_per_second=2.62851Gi/s items_per_second=2.82235G/s ``` After: ``` --------------------------------------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations UserCounters... 
--------------------------------------------------------------------------------------------------------------------------- BM_ReadColumn/-1/0 6438249 ns 6435159 ns 109 bytes_per_second=194.245Mi/s items_per_second=1.62945G/s BM_ReadColumn/1/20 19427495 ns 19419378 ns 37 bytes_per_second=64.3687Mi/s items_per_second=539.964M/s BM_ReadColumn/-1/1 58342877 ns 58298236 ns 12 bytes_per_second=21.4415Mi/s items_per_second=179.864M/s BM_ReadColumn/5/10 46591584 ns 46532288 ns 15 bytes_per_second=26.8631Mi/s items_per_second=225.344M/s BM_ReadIndividualRowGroups 30039049 ns 30021676 ns 23 bytes_per_second=2.60229Gi/s items_per_second=349.273M/s BM_ReadMultipleRowGroups 47877663 ns 47650438 ns 15 bytes_per_second=1.63954Gi/s items_per_second=220.056M/s BM_ReadMultipleRowGroupsGenerator 30377987 ns 30360019 ns 23 bytes_per_second=2.57329Gi/s items_per_second=345.381M/s ``` ### Are these changes tested? Manually by running benchmarks. ### Are there any user-facing changes? No, but this breaks historical comparisons in continuous benchmarking. * GitHub Issue: #44081 Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- .../parquet/arrow/reader_writer_benchmark.cc | 95 +++++++++++-------- 1 file changed, 56 insertions(+), 39 deletions(-) diff --git a/cpp/src/parquet/arrow/reader_writer_benchmark.cc b/cpp/src/parquet/arrow/reader_writer_benchmark.cc index b12f234f72bdf..283b113dfe992 100644 --- a/cpp/src/parquet/arrow/reader_writer_benchmark.cc +++ b/cpp/src/parquet/arrow/reader_writer_benchmark.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include "parquet/arrow/reader.h" #include "parquet/arrow/writer.h" @@ -37,6 +38,7 @@ #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" #include "arrow/util/async_generator.h" +#include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" #include "arrow/util/logging.h" @@ -45,6 +47,7 @@ using arrow::ArrayVector; using arrow::BooleanBuilder; using arrow::FieldVector; using arrow::NumericBuilder; +using arrow::Table; #define EXIT_NOT_OK(s) \ do { \ @@ -104,13 +107,28 @@ std::shared_ptr MakeSchema(Repetition::type repetition) { repetition == Repetition::REPEATED); } -template +template +int64_t BytesForItems(int64_t num_items) { + static_assert(!std::is_same_v, + "BytesForItems unsupported for FLBAType"); + return num_items * sizeof(typename ParquetType::c_type); +} + +template <> +int64_t BytesForItems(int64_t num_items) { + return ::arrow::bit_util::BytesForBits(num_items); +} + +template <> +int64_t BytesForItems(int64_t num_items) { + return num_items * sizeof(uint16_t); +} + +template void SetBytesProcessed(::benchmark::State& state, int64_t num_values = BENCHMARK_SIZE) { const int64_t items_processed = state.iterations() * num_values; - const int64_t bytes_processed = items_processed * sizeof(typename ParquetType::c_type); - - state.SetItemsProcessed(bytes_processed); - state.SetBytesProcessed(bytes_processed); + state.SetItemsProcessed(items_processed); + state.SetBytesProcessed(BytesForItems(items_processed)); } constexpr int64_t kAlternatingOrNa = -1; @@ -132,9 +150,9 @@ std::vector RandomVector(int64_t true_percentage, int64_t vector_size, } template > -std::shared_ptr<::arrow::Table> TableFromVector( - const std::vector& vec, bool nullable, - int64_t null_percentage = kAlternatingOrNa) { +std::shared_ptr
TableFromVector(const std::vector& vec, + bool nullable, + int64_t null_percentage = kAlternatingOrNa) { if (!nullable) { ARROW_CHECK_EQ(null_percentage, kAlternatingOrNa); } @@ -153,13 +171,12 @@ std::shared_ptr<::arrow::Table> TableFromVector( auto field = ::arrow::field("column", type, nullable); auto schema = ::arrow::schema({field}); - return ::arrow::Table::Make(schema, {array}); + return Table::Make(schema, {array}); } template <> -std::shared_ptr<::arrow::Table> TableFromVector(const std::vector& vec, - bool nullable, - int64_t null_percentage) { +std::shared_ptr
TableFromVector( + const std::vector& vec, bool nullable, int64_t null_percentage) { BooleanBuilder builder; if (nullable) { auto valid_bytes = RandomVector(/*true_percentage=*/null_percentage, vec.size(), @@ -174,21 +191,21 @@ std::shared_ptr<::arrow::Table> TableFromVector(const std::vector( std::vector>({field})); - return ::arrow::Table::Make(schema, {array}); + return Table::Make(schema, {array}); } template static void BM_WriteColumn(::benchmark::State& state) { using T = typename ParquetType::c_type; std::vector values(BENCHMARK_SIZE, static_cast(128)); - std::shared_ptr<::arrow::Table> table = TableFromVector(values, nullable); + std::shared_ptr
table = TableFromVector(values, nullable); while (state.KeepRunning()) { auto output = CreateOutputStream(); EXIT_NOT_OK( WriteTable(*table, ::arrow::default_memory_pool(), output, BENCHMARK_SIZE)); } - SetBytesProcessed(state); + SetBytesProcessed(state); } BENCHMARK_TEMPLATE2(BM_WriteColumn, false, Int32Type); @@ -205,8 +222,8 @@ BENCHMARK_TEMPLATE2(BM_WriteColumn, true, BooleanType); int32_t kInfiniteUniqueValues = -1; -std::shared_ptr<::arrow::Table> RandomStringTable(int64_t length, int64_t unique_values, - int64_t null_percentage) { +std::shared_ptr
RandomStringTable(int64_t length, int64_t unique_values, + int64_t null_percentage) { std::shared_ptr<::arrow::DataType> type = ::arrow::utf8(); std::shared_ptr<::arrow::Array> arr; ::arrow::random::RandomArrayGenerator generator(/*seed=*/500); @@ -219,12 +236,12 @@ std::shared_ptr<::arrow::Table> RandomStringTable(int64_t length, int64_t unique /*min_length=*/3, /*max_length=*/32, /*null_probability=*/null_probability); } - return ::arrow::Table::Make( + return Table::Make( ::arrow::schema({::arrow::field("column", type, null_percentage > 0)}), {arr}); } static void BM_WriteBinaryColumn(::benchmark::State& state) { - std::shared_ptr<::arrow::Table> table = + std::shared_ptr
table = RandomStringTable(BENCHMARK_SIZE, state.range(1), state.range(0)); while (state.KeepRunning()) { @@ -263,7 +280,7 @@ struct Examples { static constexpr std::array values() { return {false, true}; } }; -static void BenchmarkReadTable(::benchmark::State& state, const ::arrow::Table& table, +static void BenchmarkReadTable(::benchmark::State& state, const Table& table, std::shared_ptr properties, int64_t num_values = -1, int64_t total_bytes = -1) { auto output = CreateOutputStream(); @@ -278,7 +295,7 @@ static void BenchmarkReadTable(::benchmark::State& state, const ::arrow::Table& EXIT_NOT_OK(FileReader::Make(::arrow::default_memory_pool(), std::move(reader), &arrow_reader)); - std::shared_ptr<::arrow::Table> table; + std::shared_ptr
table; EXIT_NOT_OK(arrow_reader->ReadTable(&table)); } @@ -291,7 +308,7 @@ static void BenchmarkReadTable(::benchmark::State& state, const ::arrow::Table& } } -static void BenchmarkReadTable(::benchmark::State& state, const ::arrow::Table& table, +static void BenchmarkReadTable(::benchmark::State& state, const Table& table, int64_t num_values = -1, int64_t total_bytes = -1) { BenchmarkReadTable(state, table, default_writer_properties(), num_values, total_bytes); } @@ -301,7 +318,7 @@ static void BenchmarkReadArray(::benchmark::State& state, std::shared_ptr properties, int64_t num_values = -1, int64_t total_bytes = -1) { auto schema = ::arrow::schema({field("s", array->type(), nullable)}); - auto table = ::arrow::Table::Make(schema, {array}, array->length()); + auto table = Table::Make(schema, {array}, array->length()); EXIT_NOT_OK(table->Validate()); @@ -326,13 +343,13 @@ static void BM_ReadColumn(::benchmark::State& state) { auto values = RandomVector(/*percentage=*/state.range(1), BENCHMARK_SIZE, Examples::values()); - std::shared_ptr<::arrow::Table> table = + std::shared_ptr
table = TableFromVector(values, nullable, state.range(0)); auto properties = WriterProperties::Builder().disable_dictionary()->build(); BenchmarkReadTable(state, *table, properties, table->num_rows(), - sizeof(typename ParquetType::c_type) * table->num_rows()); + BytesForItems(table->num_rows())); } // There are two parameters here that cover different data distributions. @@ -403,12 +420,12 @@ static void BM_ReadColumnPlain(::benchmark::State& state) { using c_type = typename ArrowType::c_type; const std::vector values(BENCHMARK_SIZE, static_cast(42)); - std::shared_ptr<::arrow::Table> table = + std::shared_ptr
table = TableFromVector(values, /*nullable=*/nullable, state.range(0)); auto properties = WriterProperties::Builder().disable_dictionary()->build(); BenchmarkReadTable(state, *table, properties, table->num_rows(), - sizeof(c_type) * table->num_rows()); + BytesForItems(table->num_rows())); } BENCHMARK_TEMPLATE2(BM_ReadColumnPlain, false, Int32Type) @@ -438,7 +455,7 @@ BENCHMARK_TEMPLATE2(BM_ReadColumnPlain, true, Float16LogicalType) // static void BM_ReadBinaryColumn(::benchmark::State& state) { - std::shared_ptr<::arrow::Table> table = + std::shared_ptr
table = RandomStringTable(BENCHMARK_SIZE, state.range(1), state.range(0)); // Offsets + data @@ -636,7 +653,7 @@ BENCHMARK(BM_ReadListOfListColumn)->Apply(NestedReadArguments); static void BM_ReadIndividualRowGroups(::benchmark::State& state) { std::vector values(BENCHMARK_SIZE, 128); - std::shared_ptr<::arrow::Table> table = TableFromVector(values, true); + std::shared_ptr
table = TableFromVector(values, true); auto output = CreateOutputStream(); // This writes 10 RowGroups EXIT_NOT_OK( @@ -651,27 +668,27 @@ static void BM_ReadIndividualRowGroups(::benchmark::State& state) { EXIT_NOT_OK(FileReader::Make(::arrow::default_memory_pool(), std::move(reader), &arrow_reader)); - std::vector> tables; + std::vector> tables; for (int i = 0; i < arrow_reader->num_row_groups(); i++) { // Only read the even numbered RowGroups if ((i % 2) == 0) { - std::shared_ptr<::arrow::Table> table; + std::shared_ptr
table; EXIT_NOT_OK(arrow_reader->RowGroup(i)->ReadTable(&table)); tables.push_back(table); } } - std::shared_ptr<::arrow::Table> final_table; + std::shared_ptr
final_table; PARQUET_ASSIGN_OR_THROW(final_table, ConcatenateTables(tables)); } - SetBytesProcessed(state); + SetBytesProcessed(state); } BENCHMARK(BM_ReadIndividualRowGroups); static void BM_ReadMultipleRowGroups(::benchmark::State& state) { std::vector values(BENCHMARK_SIZE, 128); - std::shared_ptr<::arrow::Table> table = TableFromVector(values, true); + std::shared_ptr
table = TableFromVector(values, true); auto output = CreateOutputStream(); // This writes 10 RowGroups EXIT_NOT_OK( @@ -685,17 +702,17 @@ static void BM_ReadMultipleRowGroups(::benchmark::State& state) { std::unique_ptr arrow_reader; EXIT_NOT_OK(FileReader::Make(::arrow::default_memory_pool(), std::move(reader), &arrow_reader)); - std::shared_ptr<::arrow::Table> table; + std::shared_ptr
table; EXIT_NOT_OK(arrow_reader->ReadRowGroups(rgs, &table)); } - SetBytesProcessed(state); + SetBytesProcessed(state); } BENCHMARK(BM_ReadMultipleRowGroups); static void BM_ReadMultipleRowGroupsGenerator(::benchmark::State& state) { std::vector values(BENCHMARK_SIZE, 128); - std::shared_ptr<::arrow::Table> table = TableFromVector(values, true); + std::shared_ptr
table = TableFromVector(values, true); auto output = CreateOutputStream(); // This writes 10 RowGroups EXIT_NOT_OK( @@ -714,9 +731,9 @@ static void BM_ReadMultipleRowGroupsGenerator(::benchmark::State& state) { arrow_reader->GetRecordBatchGenerator(arrow_reader, rgs, {0})); auto fut = ::arrow::CollectAsyncGenerator(generator); ASSIGN_OR_ABORT(auto batches, fut.result()); - ASSIGN_OR_ABORT(auto actual, ::arrow::Table::FromRecordBatches(std::move(batches))); + ASSIGN_OR_ABORT(auto actual, Table::FromRecordBatches(std::move(batches))); } - SetBytesProcessed(state); + SetBytesProcessed(state); } BENCHMARK(BM_ReadMultipleRowGroupsGenerator); From 5fd9d74afbca0e2015eee88e268619203e8a8d04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Thu, 12 Sep 2024 16:36:05 +0200 Subject: [PATCH 123/130] GH-44076: [CI] Remove verify-rc-binaries-wheel-macos-11 which is now deprecated (#44077) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Our wheels deployment target is now MACOSX_DEPLOYMENT_TARGET=12.0 and the macOS 11 runner is deprecated. ### What changes are included in this PR? Remove macos-11 from CI matrix. ### Are these changes tested? No, those jobs are triggered on release and is just removing a job from the matrix. ### Are there any user-facing changes? No * GitHub Issue: #44076 Authored-by: Raúl Cumplido Signed-off-by: Jacob Wujciak-Jens --- dev/tasks/tasks.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index c1c15a3ff73fd..9f13245c53f4a 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -996,7 +996,7 @@ tasks: github_runner: "macos-14" {% endfor %} - {% for macos_version in ["11", "12"] %} + {% for macos_version in ["12"] %} verify-rc-binaries-wheels-macos-{{ macos_version }}-amd64: ci: github template: verify-rc/github.macos.yml From 1fe30d3b6b5c523814375ad4a161946dfb87b37f Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Thu, 12 Sep 2024 18:26:46 +0300 Subject: [PATCH 124/130] GH-44046: [Python] Fix threading issues with borrowed refs and pandas (#44047) ### Rationale for this change Fix threading bugs that could leads to races under the free-threaded build. ### What changes are included in this PR? - Use `PySequence_ITEM` instead of the `Fast` variant on lists under the free-threaded build. - Use `std::once_flag` to make sure that `pandas` staic data only gets initialized once. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * GitHub Issue: #44046 Lead-authored-by: Lysandros Nikolaou Co-authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- python/pyarrow/src/arrow/python/helpers.cc | 38 ++++++++++++++----- python/pyarrow/src/arrow/python/iterators.h | 4 ++ .../pyarrow/src/arrow/python/numpy_convert.cc | 12 ++++++ 3 files changed, 45 insertions(+), 9 deletions(-) diff --git a/python/pyarrow/src/arrow/python/helpers.cc b/python/pyarrow/src/arrow/python/helpers.cc index 18302e6fe0401..ca89ebe9d8bdd 100644 --- a/python/pyarrow/src/arrow/python/helpers.cc +++ b/python/pyarrow/src/arrow/python/helpers.cc @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -292,7 +293,15 @@ bool PyFloat_IsNaN(PyObject* obj) { namespace { +// This needs a conditional, because using std::once_flag could introduce +// a deadlock when the GIL is enabled. See +// https://github.com/apache/arrow/commit/f69061935e92e36e25bb891177ca8bc4f463b272 for +// more info. 
+#ifdef Py_GIL_DISABLED +static std::once_flag pandas_static_initialized; +#else static bool pandas_static_initialized = false; +#endif // Once initialized, these variables hold borrowed references to Pandas static data. // We should not use OwnedRef here because Python destructors would be @@ -304,15 +313,7 @@ static PyObject* pandas_Timestamp = nullptr; static PyTypeObject* pandas_NaTType = nullptr; static PyObject* pandas_DateOffset = nullptr; -} // namespace - -void InitPandasStaticData() { - // NOTE: This is called with the GIL held. We needn't (and shouldn't, - // to avoid deadlocks) use an additional C++ lock (ARROW-10519). - if (pandas_static_initialized) { - return; - } - +void GetPandasStaticSymbols() { OwnedRef pandas; // Import pandas @@ -321,11 +322,14 @@ void InitPandasStaticData() { return; } +#ifndef Py_GIL_DISABLED // Since ImportModule can release the GIL, another thread could have // already initialized the static data. if (pandas_static_initialized) { return; } +#endif + OwnedRef ref; // set NaT sentinel and its type @@ -355,9 +359,25 @@ void InitPandasStaticData() { if (ImportFromModule(pandas.obj(), "DateOffset", &ref).ok()) { pandas_DateOffset = ref.obj(); } +} + +} // namespace +#ifdef Py_GIL_DISABLED +void InitPandasStaticData() { + std::call_once(pandas_static_initialized, GetPandasStaticSymbols); +} +#else +void InitPandasStaticData() { + // NOTE: This is called with the GIL held. We needn't (and shouldn't, + // to avoid deadlocks) use an additional C++ lock (ARROW-10519). + if (pandas_static_initialized) { + return; + } + GetPandasStaticSymbols(); pandas_static_initialized = true; } +#endif bool PandasObjectIsNull(PyObject* obj) { if (!MayHaveNaN(obj)) { diff --git a/python/pyarrow/src/arrow/python/iterators.h b/python/pyarrow/src/arrow/python/iterators.h index 8512276848272..dd467f6ac4077 100644 --- a/python/pyarrow/src/arrow/python/iterators.h +++ b/python/pyarrow/src/arrow/python/iterators.h @@ -67,7 +67,11 @@ inline Status VisitSequenceGeneric(PyObject* obj, int64_t offset, VisitorFunc&& } if (PySequence_Check(obj)) { +#ifdef Py_GIL_DISABLED + if (PyTuple_Check(obj)) { +#else if (PyList_Check(obj) || PyTuple_Check(obj)) { +#endif // Use fast item access const Py_ssize_t size = PySequence_Fast_GET_SIZE(obj); for (Py_ssize_t i = offset; keep_going && i < size; ++i) { diff --git a/python/pyarrow/src/arrow/python/numpy_convert.cc b/python/pyarrow/src/arrow/python/numpy_convert.cc index 5fd2cb511ff8a..4113cc67d2fc6 100644 --- a/python/pyarrow/src/arrow/python/numpy_convert.cc +++ b/python/pyarrow/src/arrow/python/numpy_convert.cc @@ -488,7 +488,13 @@ Status NdarraysToSparseCSFTensor(MemoryPool* pool, PyObject* data_ao, PyObject* std::vector> indices(ndim); for (int i = 0; i < ndim - 1; ++i) { +#ifdef Py_GIL_DISABLED + PyObject* item = PySequence_ITEM(indptr_ao, i); + RETURN_IF_PYERROR(); + OwnedRef item_ref(item); +#else PyObject* item = PySequence_Fast_GET_ITEM(indptr_ao, i); +#endif if (!PyArray_Check(item)) { return Status::TypeError("Did not pass ndarray object for indptr"); } @@ -497,7 +503,13 @@ Status NdarraysToSparseCSFTensor(MemoryPool* pool, PyObject* data_ao, PyObject* } for (int i = 0; i < ndim; ++i) { +#ifdef Py_GIL_DISABLED + PyObject* item = PySequence_ITEM(indices_ao, i); + RETURN_IF_PYERROR(); + OwnedRef item_ref(item); +#else PyObject* item = PySequence_Fast_GET_ITEM(indices_ao, i); +#endif if (!PyArray_Check(item)) { return Status::TypeError("Did not pass ndarray object for indices"); } From d2dd352b9121b1e9e1114155a2e6979f4665986a Mon Sep 17 
00:00:00 2001 From: Antoine Pitrou Date: Thu, 12 Sep 2024 17:43:46 +0200 Subject: [PATCH 125/130] MINOR: [CI] Bump actions/{download,upload}-artifact version (#44086) v2 and v3 are deprecated and can fail CI builds, bump to v4. Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- dev/tasks/docker-tests/github.linux.yml | 2 +- dev/tasks/java-jars/github.yml | 8 ++++---- dev/tasks/python-wheels/github.linux.yml | 2 +- dev/tasks/python-wheels/github.osx.yml | 2 +- dev/tasks/python-wheels/github.windows.yml | 2 +- dev/tasks/r/github.devdocs.yml | 2 +- .../r/github.linux.arrow.version.back.compat.yml | 4 ++-- dev/tasks/r/github.linux.cran.yml | 4 ++-- dev/tasks/r/github.linux.offline.build.yml | 6 +++--- dev/tasks/r/github.linux.versions.yml | 4 ++-- dev/tasks/r/github.macos-linux.local.yml | 4 ++-- dev/tasks/r/github.macos.cran.yml | 2 +- dev/tasks/r/github.packages.yml | 16 ++++++++-------- 13 files changed, 29 insertions(+), 29 deletions(-) diff --git a/dev/tasks/docker-tests/github.linux.yml b/dev/tasks/docker-tests/github.linux.yml index cd2923a50d6df..ee221d6f6d8d6 100644 --- a/dev/tasks/docker-tests/github.linux.yml +++ b/dev/tasks/docker-tests/github.linux.yml @@ -63,7 +63,7 @@ jobs: done - name: Save the R test output if: always() - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v4 with: name: test-output path: arrow/r/check/arrow.Rcheck/tests/testthat.Rout* diff --git a/dev/tasks/java-jars/github.yml b/dev/tasks/java-jars/github.yml index bdbed1bd678e6..9910daa21ef37 100644 --- a/dev/tasks/java-jars/github.yml +++ b/dev/tasks/java-jars/github.yml @@ -59,7 +59,7 @@ jobs: - name: Compress into single artifact to keep directory structure run: tar -cvzf arrow-shared-libs-linux-{{ arch }}.tar.gz arrow/java-dist/ - name: Upload artifacts - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v4 with: name: ubuntu-shared-lib-{{ arch }} path: arrow-shared-libs-linux-{{ arch }}.tar.gz @@ -152,7 +152,7 @@ jobs: - name: Compress into single artifact to keep directory structure run: tar -cvzf arrow-shared-libs-macos-{{ arch }}.tar.gz arrow/java-dist/ - name: Upload artifacts - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v4 with: name: macos-shared-lib-{{ arch }} path: arrow-shared-libs-macos-{{ arch }}.tar.gz @@ -186,7 +186,7 @@ jobs: shell: bash run: tar -cvzf arrow-shared-libs-windows.tar.gz arrow/java-dist/ - name: Upload artifacts - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v4 with: name: windows-shared-lib path: arrow-shared-libs-windows.tar.gz @@ -201,7 +201,7 @@ jobs: steps: {{ macros.github_checkout_arrow(fetch_depth=0)|indent }} - name: Download Libraries - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: path: artifacts - name: Decompress artifacts diff --git a/dev/tasks/python-wheels/github.linux.yml b/dev/tasks/python-wheels/github.linux.yml index d9dbef82a948e..faca698b71a4d 100644 --- a/dev/tasks/python-wheels/github.linux.yml +++ b/dev/tasks/python-wheels/github.linux.yml @@ -50,7 +50,7 @@ jobs: shell: bash run: archery docker run -e SETUPTOOLS_SCM_PRETEND_VERSION={{ arrow.no_rc_version }} python-wheel-manylinux-{{ manylinux_version }} - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: name: wheel path: arrow/python/repaired_wheels/*.whl diff --git a/dev/tasks/python-wheels/github.osx.yml b/dev/tasks/python-wheels/github.osx.yml index 98e06a14ff222..5d85e7905726e 100644 --- a/dev/tasks/python-wheels/github.osx.yml +++ 
b/dev/tasks/python-wheels/github.osx.yml @@ -108,7 +108,7 @@ jobs: pip install --upgrade pip wheel PYTHON=python arrow/ci/scripts/python_wheel_macos_build.sh {{ arch }} $(pwd)/arrow $(pwd)/build - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: name: wheel path: arrow/python/repaired_wheels/*.whl diff --git a/dev/tasks/python-wheels/github.windows.yml b/dev/tasks/python-wheels/github.windows.yml index 3a943b6ae515c..2bcda4966db8b 100644 --- a/dev/tasks/python-wheels/github.windows.yml +++ b/dev/tasks/python-wheels/github.windows.yml @@ -58,7 +58,7 @@ jobs: ) archery docker run --no-build -e SETUPTOOLS_SCM_PRETEND_VERSION={{ arrow.no_rc_version }} python-wheel-windows-vs2019 - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: name: wheel path: arrow/python/dist/*.whl diff --git a/dev/tasks/r/github.devdocs.yml b/dev/tasks/r/github.devdocs.yml index 530fb5e2f2ea9..6047951155cde 100644 --- a/dev/tasks/r/github.devdocs.yml +++ b/dev/tasks/r/github.devdocs.yml @@ -68,7 +68,7 @@ jobs: EOF shell: bash -l {0} - name: Save the install script - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v4 with: name: {{ "devdocs-script_os-${{ matrix.os }}_sysinstall-${{ matrix.system-install }}" }} path: arrow/r/vignettes/developers/script.sh diff --git a/dev/tasks/r/github.linux.arrow.version.back.compat.yml b/dev/tasks/r/github.linux.arrow.version.back.compat.yml index 086705dbb9cf4..90b2554eb8cd7 100644 --- a/dev/tasks/r/github.linux.arrow.version.back.compat.yml +++ b/dev/tasks/r/github.linux.arrow.version.back.compat.yml @@ -58,7 +58,7 @@ jobs: shell: bash - name: Upload the parquet artifacts - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v4 with: name: files path: arrow/r/extra-tests/files @@ -108,7 +108,7 @@ jobs: cp arrow/r/extra-tests/helper*.R extra-tests/ cp arrow/r/extra-tests/test-*.R extra-tests/ - name: Download artifacts - uses: actions/download-artifact@v2 + uses: actions/download-artifact@v4 with: name: files path: extra-tests/files diff --git a/dev/tasks/r/github.linux.cran.yml b/dev/tasks/r/github.linux.cran.yml index 34cb4b9446a0b..8f56bf771d224 100644 --- a/dev/tasks/r/github.linux.cran.yml +++ b/dev/tasks/r/github.linux.cran.yml @@ -55,7 +55,7 @@ jobs: if: always() - name: Save the test output if: always() - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v4 with: - name: test-output + name: test-output-{{ "${{ matrix.r_image }}" }} path: arrow/r/check/arrow.Rcheck/tests/testthat.Rout* diff --git a/dev/tasks/r/github.linux.offline.build.yml b/dev/tasks/r/github.linux.offline.build.yml index 9ac0ebc40835e..62cdaa02051dd 100644 --- a/dev/tasks/r/github.linux.offline.build.yml +++ b/dev/tasks/r/github.linux.offline.build.yml @@ -41,7 +41,7 @@ jobs: R -e "source('R/install-arrow.R'); create_package_with_all_dependencies(dest_file = 'arrow_with_deps.tar.gz', source_file = \"${built_tar}\")" shell: bash - name: Upload the third party dependency artifacts - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v4 with: name: thirdparty_deps path: arrow/r/arrow_with_deps.tar.gz @@ -60,7 +60,7 @@ jobs: - uses: r-lib/actions/setup-r@v2 - name: Download artifacts - uses: actions/download-artifact@v2 + uses: actions/download-artifact@v4 with: name: thirdparty_deps path: arrow/r/ @@ -91,7 +91,7 @@ jobs: run: cat arrow-tests/testthat.Rout* if: always() - name: Save the test output - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v4 with: name: test-output path: 
arrow-tests/testthat.Rout* diff --git a/dev/tasks/r/github.linux.versions.yml b/dev/tasks/r/github.linux.versions.yml index 753efe61d048e..092ac97de8ec4 100644 --- a/dev/tasks/r/github.linux.versions.yml +++ b/dev/tasks/r/github.linux.versions.yml @@ -55,7 +55,7 @@ jobs: if: always() - name: Save the test output if: always() - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v4 with: - name: test-output + name: test-output-{{ "${{ matrix.r_version }}" }} path: arrow/r/check/arrow.Rcheck/tests/testthat.Rout* diff --git a/dev/tasks/r/github.macos-linux.local.yml b/dev/tasks/r/github.macos-linux.local.yml index b221e8c5d8d5b..2db80f254fec5 100644 --- a/dev/tasks/r/github.macos-linux.local.yml +++ b/dev/tasks/r/github.macos-linux.local.yml @@ -97,8 +97,8 @@ jobs: run: cat arrow-tests/testthat.Rout* if: failure() - name: Save the test output - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v4 with: - name: test-output + name: test-output-{{ "${{ matrix.os }}" }} path: arrow-tests/testthat.Rout* if: always() diff --git a/dev/tasks/r/github.macos.cran.yml b/dev/tasks/r/github.macos.cran.yml index 33965988e213a..dda8ac7fd7850 100644 --- a/dev/tasks/r/github.macos.cran.yml +++ b/dev/tasks/r/github.macos.cran.yml @@ -75,7 +75,7 @@ jobs: run: cat arrow-tests/testthat.Rout* if: failure() - name: Save the test output - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v4 with: name: test-output path: arrow-tests/testthat.Rout* diff --git a/dev/tasks/r/github.packages.yml b/dev/tasks/r/github.packages.yml index db6955b92d1e0..66008275148f9 100644 --- a/dev/tasks/r/github.packages.yml +++ b/dev/tasks/r/github.packages.yml @@ -51,7 +51,7 @@ jobs: R CMD build --no-build-vignettes . - name: Upload package artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: r-pkg__src__contrib path: arrow/r/arrow_*.tar.gz @@ -106,7 +106,7 @@ jobs: cd arrow/r/libarrow/dist shasum -a 512 arrow-*.zip > arrow-{{ '${{ needs.source.outputs.pkg_version }}' }}.zip.sha512 - name: Upload binary artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: r-lib__libarrow__bin__darwin-{{ '${{ matrix.platform.arch }}' }}-openssl-{{ '${{ matrix.openssl }}' }} path: arrow/r/libarrow/dist/arrow-*.zip* @@ -161,7 +161,7 @@ jobs: cd arrow/r/libarrow/dist shasum -a 512 arrow-*.zip > arrow-{{ '${{ needs.source.outputs.pkg_version }}' }}.zip.sha512 - name: Upload binary artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: r-lib__libarrow__bin__linux-openssl-{{ '${{ matrix.openssl }}' }} path: arrow/r/libarrow/dist/arrow-*.zip* @@ -194,7 +194,7 @@ jobs: cd build sha512sum arrow-*.zip > arrow-{{ '${{ needs.source.outputs.pkg_version }}' }}.zip.sha512 - name: Upload binary artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: r-lib__libarrow__bin__windows path: build/arrow-*.zip* @@ -291,7 +291,7 @@ jobs: cat(cmd, file = Sys.getenv("GITHUB_OUTPUT"), append = TRUE) - name: Upload binary artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: r-pkg{{ '${{ steps.build.outputs.path }}' }} path: arrow_* @@ -347,7 +347,7 @@ jobs: ' - name: Upload binary artifact if: matrix.config.devtoolset - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: r-pkg_centos7 path: arrow_* @@ -359,7 +359,7 @@ jobs: runs-on: ubuntu-latest container: "rstudio/r-base:4.2-centos7" steps: - - uses: actions/download-artifact@v3 
+ - uses: actions/download-artifact@v4 with: name: r-pkg_centos7 - name: Install DTS Package @@ -441,7 +441,7 @@ jobs: steps: {{ macros.github_checkout_arrow()|indent }} - name: Download Artifacts - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: path: artifacts - name: Install R From ed8585e50a7a5d3addfe1ee5afa70a1a4c714daa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Thu, 12 Sep 2024 18:09:34 +0200 Subject: [PATCH 126/130] GH-43840: [CI] Add cuda group to tasks.yml and minor updates for new cuda runner image (#43841) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Trigger cuda jobs as a group on crossbow ### What changes are included in this PR? Grouping of cuda tasks under `cuda` on tasks.yml. We have also updated the cuda runner image used to https://github.com/voltrondata-labs/cuda-action-runner-builder/blob/bc1797368e02d98e4dc04de8afe41807e2171f3c/.github/workflows/cuda-dind-runners.yaml. It will run on Ubuntu 22.04, base Python is updated to 3.10, CUDA updated from 11.4.1 to 11.8.0 ### Are these changes tested? Via archery ### Are there any user-facing changes? No * GitHub Issue: #43840 Authored-by: Raúl Cumplido Signed-off-by: Antoine Pitrou --- dev/tasks/docker-tests/github.cuda.yml | 8 ++++---- dev/tasks/tasks.yml | 3 +++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/dev/tasks/docker-tests/github.cuda.yml b/dev/tasks/docker-tests/github.cuda.yml index d03b3657afc53..e65ac457b2ef7 100644 --- a/dev/tasks/docker-tests/github.cuda.yml +++ b/dev/tasks/docker-tests/github.cuda.yml @@ -26,13 +26,13 @@ jobs: runs-on: ['self-hosted', 'cuda'] {{ macros.github_set_env(env) }} timeout-minutes: {{ timeout|default(60) }} - env: - ARCHERY_USE_LEGACY_DOCKER_COMPOSE: 1 steps: {{ macros.github_checkout_arrow(fetch_depth=fetch_depth|default(1))|indent }} - # python 3.8 is installed on the runner, no need to install + # python 3.10 is installed on the runner, no need to install + - name: Install pip + run: sudo apt update && sudo apt install python3-pip -y - name: Install archery - run: python -m pip install -e arrow/dev/archery[docker] + run: python3 -m pip install -e arrow/dev/archery[docker] - name: Execute Docker Build shell: bash env: diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 9f13245c53f4a..9bb7eedd7b3ee 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -70,6 +70,9 @@ groups: {############################# Testing tasks #################################} + cuda: + - test-cuda-* + test: - test-* From a6b718eea6f8683ef3173415fefb4447d19a3fa7 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 12 Sep 2024 13:39:42 -0300 Subject: [PATCH 127/130] GH-42247: [C++] Support casting to and from utf8_view/binary_view (#43302) ### Rationale for this change We need casts between string (binary) and string-view (binary-view) types since they are semantically equivalent. ### What changes are included in this PR? - Add `is_binary_view_like()` type predicate - Add `BinaryViewTypes()` list including `STRING_VIEW/BINARY_VIEW` - New cast kernels ### Are these changes tested? Yes, but test coverage might be improved. ### Are there any user-facing changes? More casts are available. 
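As a rough illustration (not part of this patch), the new casts could be exercised from C++ along these lines; this is a minimal sketch assuming the existing `arrow::compute::Cast` entry point and the `arrow::utf8_view()` / `arrow::utf8()` type factories, with `RoundTripStringView` being a hypothetical helper name:

```cpp
#include <arrow/api.h>
#include <arrow/compute/api.h>

// Sketch: round-trip a utf8 array through utf8_view (string-view) and back.
arrow::Status RoundTripStringView() {
  // Build an ordinary offset-based utf8 array.
  arrow::StringBuilder builder;
  ARROW_RETURN_NOT_OK(builder.AppendValues({"foo", "bar", "baz"}));
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> utf8_array, builder.Finish());

  // utf8 -> utf8_view: exercises one of the newly added cast kernels.
  ARROW_ASSIGN_OR_RAISE(arrow::Datum as_view,
                        arrow::compute::Cast(utf8_array, arrow::utf8_view()));

  // utf8_view -> utf8: cast back to the conventional offset-based layout.
  ARROW_ASSIGN_OR_RAISE(arrow::Datum back,
                        arrow::compute::Cast(as_view, arrow::utf8()));

  // The round trip should preserve the data exactly.
  if (!back.make_array()->Equals(*utf8_array)) {
    return arrow::Status::Invalid("Round trip through utf8_view changed the data");
  }
  return arrow::Status::OK();
}
```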
* GitHub Issue: #42247 Lead-authored-by: Felipe Oliveira Carvalho Co-authored-by: mwish Signed-off-by: Antoine Pitrou --- .../arrow/compute/kernels/codegen_internal.h | 19 +- .../compute/kernels/scalar_cast_boolean.cc | 6 + .../compute/kernels/scalar_cast_internal.cc | 7 +- .../compute/kernels/scalar_cast_numeric.cc | 24 +- .../compute/kernels/scalar_cast_string.cc | 289 +++++++++++++++++- .../arrow/compute/kernels/scalar_cast_test.cc | 146 ++++++--- cpp/src/arrow/type.cc | 12 +- cpp/src/arrow/type.h | 3 + cpp/src/arrow/type_test.cc | 2 + cpp/src/arrow/type_traits.h | 25 ++ cpp/src/arrow/util/binary_view_util.h | 13 + cpp/src/arrow/visit_data_inline.h | 3 +- 12 files changed, 473 insertions(+), 76 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.h b/cpp/src/arrow/compute/kernels/codegen_internal.h index 9e46a21887f8c..7f9be92f3a14b 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.h +++ b/cpp/src/arrow/compute/kernels/codegen_internal.h @@ -133,7 +133,8 @@ struct GetViewType> { template struct GetViewType::value || - is_fixed_size_binary_type::value>> { + is_fixed_size_binary_type::value || + is_binary_view_like_type::value>> { using T = std::string_view; using PhysicalType = T; @@ -1265,6 +1266,22 @@ ArrayKernelExec GenerateVarBinary(detail::GetTypeId get_id) { } } +// Generate a kernel given a templated functor for binary-view types. Generates a +// single kernel for binary/string-view. +// +// See "Numeric" above for description of the generator functor +template