Skip to content

Commit

Permalink
GH-15058: [C++][Python] Native support for UUID (#37298)
Browse files Browse the repository at this point in the history
### Rationale for this change

See #15058.
The UUID datatype is common throughout the ecosystem, and supporting it as a native type in Arrow would reduce friction.

### What changes are included in this PR?

This PR implements the logic for the Arrow canonical UUID extension type in C++ and adds a Python wrapper for it.

### Are these changes tested?

Yes.

### Are there any user-facing changes?

Yes, a new extension type is added.
* Closes: #15058

Authored-by: Rok Mihevc <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
  • Loading branch information
rok authored Aug 26, 2024
1 parent 51e9f70 commit 2328b6e
Show file tree
Hide file tree
Showing 29 changed files with 412 additions and 132 deletions.
3 changes: 2 additions & 1 deletion cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,7 @@ set(ARROW_SRCS
device.cc
extension_type.cc
extension/bool8.cc
extension/uuid.cc
pretty_print.cc
record_batch.cc
result.cc
Expand Down Expand Up @@ -1225,6 +1226,7 @@ add_subdirectory(testing)
add_subdirectory(array)
add_subdirectory(c)
add_subdirectory(compute)
add_subdirectory(extension)
add_subdirectory(io)
add_subdirectory(tensor)
add_subdirectory(util)
Expand Down Expand Up @@ -1267,7 +1269,6 @@ endif()

if(ARROW_JSON)
add_subdirectory(json)
add_subdirectory(extension)
endif()

if(ARROW_ORC)
Expand Down
1 change: 1 addition & 0 deletions cpp/src/arrow/acero/hash_join_node_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include "arrow/compute/kernels/test_util.h"
#include "arrow/compute/light_array_internal.h"
#include "arrow/compute/row/row_encoder_internal.h"
#include "arrow/extension/uuid.h"
#include "arrow/testing/extension_type.h"
#include "arrow/testing/generator.h"
#include "arrow/testing/gtest_util.h"
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/extension/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
# specific language governing permissions and limitations
# under the License.

set(CANONICAL_EXTENSION_TESTS bool8_test.cc)
set(CANONICAL_EXTENSION_TESTS bool8_test.cc uuid_test.cc)

if(ARROW_JSON)
list(APPEND CANONICAL_EXTENSION_TESTS fixed_shape_tensor_test.cc opaque_test.cc)
Expand Down
17 changes: 2 additions & 15 deletions cpp/src/arrow/extension/fixed_shape_tensor_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
#include "arrow/array/array_primitive.h"
#include "arrow/io/memory.h"
#include "arrow/ipc/reader.h"
#include "arrow/ipc/writer.h"
#include "arrow/ipc/test_common.h"
#include "arrow/record_batch.h"
#include "arrow/tensor.h"
#include "arrow/testing/gtest_util.h"
Expand All @@ -33,6 +33,7 @@
namespace arrow {

using FixedShapeTensorType = extension::FixedShapeTensorType;
using arrow::ipc::test::RoundtripBatch;
using extension::fixed_shape_tensor;
using extension::FixedShapeTensorArray;

Expand Down Expand Up @@ -71,20 +72,6 @@ class TestExtensionType : public ::testing::Test {
std::string serialized_;
};

// Test helper: writes `batch` to an in-memory IPC stream and reads the first
// record batch of that stream back into `*out`.
auto RoundtripBatch = [](const std::shared_ptr<RecordBatch>& batch,
std::shared_ptr<RecordBatch>* out) {
// Serialize the batch as a record batch stream using default write options.
ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create());
ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(),
out_stream.get()));

// Finish() yields the buffer holding everything written so far.
ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish());

// Re-open the buffer as a stream reader and pull the next batch into `out`.
io::BufferReader reader(complete_ipc_stream);
std::shared_ptr<RecordBatchReader> batch_reader;
ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader));
ASSERT_OK(batch_reader->ReadNext(out));
};

TEST_F(TestExtensionType, CheckDummyRegistration) {
// We need a registered dummy type at runtime to allow for IPC deserialization
auto registered_type = GetExtensionType("arrow.fixed_shape_tensor");
Expand Down
58 changes: 58 additions & 0 deletions cpp/src/arrow/extension/uuid.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include <sstream>

#include "arrow/extension_type.h"
#include "arrow/util/logging.h"

#include "arrow/extension/uuid.h"

namespace arrow::extension {

/// Two extension types compare equal here when their extension names match;
/// no other property of `other` is inspected.
bool UuidType::ExtensionEquals(const ExtensionType& other) const {
  const std::string other_name = other.extension_name();
  const std::string own_name = this->extension_name();
  return other_name == own_name;
}

/// Wrap `data` in a UuidArray.
///
/// The checks below are debug assertions (DCHECK): `data->type` is expected to
/// be an extension type whose extension name is "arrow.uuid".
std::shared_ptr<Array> UuidType::MakeArray(std::shared_ptr<ArrayData> data) const {
DCHECK_EQ(data->type->id(), Type::EXTENSION);
// The cast is only meaningful because the id check above expects an extension type.
DCHECK_EQ("arrow.uuid",
static_cast<const ExtensionType&>(*data->type).extension_name());
return std::make_shared<UuidArray>(data);
}

/// Rebuild a UuidType from its storage type and serialized metadata.
///
/// \param storage_type must equal fixed_size_binary(16)
/// \param serialized must be empty (Serialize() produces an empty string)
/// \return a new UuidType, or Status::Invalid if either argument is unexpected
Result<std::shared_ptr<DataType>> UuidType::Deserialize(
    std::shared_ptr<DataType> storage_type, const std::string& serialized) const {
  // The metadata check comes first so that it wins when both inputs are bad.
  const bool has_metadata = !serialized.empty();
  if (has_metadata) {
    return Status::Invalid("Unexpected serialized metadata: '", serialized, "'");
  }

  const auto expected_storage = fixed_size_binary(16);
  const bool storage_matches = storage_type->Equals(*expected_storage);
  if (storage_matches) {
    return std::make_shared<UuidType>();
  }
  return Status::Invalid("Invalid storage type for UuidType: ",
                         storage_type->ToString());
}

/// Render the type as "extension<" + extension_name() + ">".
/// `show_metadata` is accepted for interface compatibility and is not used.
std::string UuidType::ToString(bool show_metadata) const {
  std::string repr = "extension<";
  repr += this->extension_name();
  repr += ">";
  return repr;
}

std::shared_ptr<DataType> uuid() { return std::make_shared<UuidType>(); }

} // namespace arrow::extension
61 changes: 61 additions & 0 deletions cpp/src/arrow/extension/uuid.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include "arrow/extension_type.h"

namespace arrow::extension {

/// \brief Array of UUID values.
///
/// UuidArray is the array class produced by UuidType::MakeArray. The
/// underlying storage type is FixedSizeBinary(16).
class ARROW_EXPORT UuidArray : public ExtensionArray {
public:
// Reuse the ExtensionArray constructors unchanged.
using ExtensionArray::ExtensionArray;
};

/// \brief UuidType is a canonical Arrow extension type for UUIDs.
///
/// UUIDs are stored as FixedSizeBinary(16) in big-endian notation. The type
/// does not interpret the bytes in any way, and no specific UUID version is
/// required or guaranteed.
class ARROW_EXPORT UuidType : public ExtensionType {
public:
/// \brief Construct a UuidType backed by fixed_size_binary(16) storage.
UuidType() : ExtensionType(fixed_size_binary(16)) {}

/// \brief Name identifying this extension type; always "arrow.uuid".
std::string extension_name() const override { return "arrow.uuid"; }
/// \brief Human-readable representation of the type.
std::string ToString(bool show_metadata = false) const override;

/// \brief Compare against another extension type.
bool ExtensionEquals(const ExtensionType& other) const override;

/// \brief Create a UuidArray from ArrayData
std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const override;

/// \brief Reconstruct the type from a storage type and serialized metadata.
Result<std::shared_ptr<DataType>> Deserialize(
std::shared_ptr<DataType> storage_type,
const std::string& serialized) const override;

/// \brief Serialized metadata; always the empty string for this type.
std::string Serialize() const override { return ""; }

/// \brief Create a UuidType instance
static Result<std::shared_ptr<DataType>> Make() { return std::make_shared<UuidType>(); }
};

/// \brief Return a UuidType instance as a generic DataType pointer.
ARROW_EXPORT std::shared_ptr<DataType> uuid();

} // namespace arrow::extension
72 changes: 72 additions & 0 deletions cpp/src/arrow/extension/uuid_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "arrow/extension/uuid.h"

#include "arrow/testing/matchers.h"

#include "arrow/io/memory.h"
#include "arrow/ipc/reader.h"
#include "arrow/ipc/test_common.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/util/key_value_metadata.h"

#include "arrow/testing/extension_type.h"

namespace arrow {

using arrow::ipc::test::RoundtripBatch;

// Checks the canonical UUID type's identity, its serialize/deserialize
// round trip, and the inputs Deserialize must reject.
TEST(TestUuuidExtensionType, ExtensionTypeTest) {
  // Qualify the factory explicitly: this file also includes
  // arrow/testing/extension_type.h, and an unqualified uuid() inside namespace
  // arrow may resolve to a test-only helper rather than the canonical type.
  // NOTE(review): confirm against arrow/testing/extension_type.h.
  auto type = extension::uuid();
  ASSERT_EQ(type->id(), Type::EXTENSION);

  const auto& ext_type = static_cast<const ExtensionType&>(*type);
  // Pin the identity of the type under test.
  ASSERT_EQ(ext_type.extension_name(), "arrow.uuid");
  ASSERT_EQ(type->ToString(), "extension<arrow.uuid>");

  std::string serialized = ext_type.Serialize();
  ASSERT_TRUE(serialized.empty());

  ASSERT_OK_AND_ASSIGN(auto deserialized,
                       ext_type.Deserialize(fixed_size_binary(16), serialized));
  ASSERT_TRUE(deserialized->Equals(*type));
  ASSERT_FALSE(deserialized->Equals(*fixed_size_binary(16)));

  // Non-empty metadata and storage other than fixed_size_binary(16) must be
  // rejected.
  ASSERT_RAISES(Invalid, ext_type.Deserialize(fixed_size_binary(16), "unexpected"));
  ASSERT_RAISES(Invalid, ext_type.Deserialize(fixed_size_binary(8), serialized));
}

// IPC round trip for the canonical UUID type, starting once from an extension
// array and once from a plain storage array tagged with extension metadata.
TEST(TestUuuidExtensionType, RoundtripBatch) {
  auto uuid_type = extension::uuid();
  auto typed_uuid = internal::checked_pointer_cast<extension::UuidType>(uuid_type);
  auto storage_arr =
      ArrayFromJSON(fixed_size_binary(16), R"(["abcdefghijklmnop", null])");
  auto uuid_arr = ExtensionType::WrapArray(uuid_type, storage_arr);

  // Case 1: an extension array goes in and an extension array comes back.
  auto uuid_field = field(/*name=*/"f0", /*type=*/uuid_type);
  auto expected_batch =
      RecordBatch::Make(schema({uuid_field}), uuid_arr->length(), {uuid_arr});
  std::shared_ptr<RecordBatch> roundtripped;
  RoundtripBatch(expected_batch, &roundtripped);
  CompareBatch(*expected_batch, *roundtripped, /*compare_metadata=*/true);

  // Case 2: a storage array plus extension metadata on the field goes in; the
  // result is compared against the extension batch from case 1.
  auto extension_metadata =
      key_value_metadata({{"ARROW:extension:name", typed_uuid->extension_name()},
                          {"ARROW:extension:metadata", ""}});
  uuid_field = field(/*name=*/"f0", /*type=*/typed_uuid->storage_type(),
                     /*nullable=*/true, /*metadata=*/extension_metadata);
  auto storage_batch =
      RecordBatch::Make(schema({uuid_field}), storage_arr->length(), {storage_arr});
  std::shared_ptr<RecordBatch> roundtripped_storage;
  RoundtripBatch(storage_batch, &roundtripped_storage);
  CompareBatch(*expected_batch, *roundtripped_storage, /*compare_metadata=*/true);
}

} // namespace arrow
4 changes: 2 additions & 2 deletions cpp/src/arrow/extension_type.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
#include "arrow/extension/fixed_shape_tensor.h"
#include "arrow/extension/opaque.h"
#endif
#include "arrow/extension/uuid.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/checked_cast.h"
Expand Down Expand Up @@ -147,14 +148,13 @@ static void CreateGlobalRegistry() {
// Register canonical extension types

g_registry = std::make_shared<ExtensionTypeRegistryImpl>();
std::vector<std::shared_ptr<DataType>> ext_types{extension::bool8()};
std::vector<std::shared_ptr<DataType>> ext_types{extension::bool8(), extension::uuid()};

#ifdef ARROW_JSON
ext_types.push_back(extension::fixed_shape_tensor(int64(), {}));
ext_types.push_back(extension::opaque(null(), "", ""));
#endif

// Register canonical extension types
for (const auto& ext_type : ext_types) {
ARROW_CHECK_OK(
g_registry->RegisterType(checked_pointer_cast<ExtensionType>(ext_type)));
Expand Down
19 changes: 4 additions & 15 deletions cpp/src/arrow/extension_type_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#include "arrow/io/memory.h"
#include "arrow/ipc/options.h"
#include "arrow/ipc/reader.h"
#include "arrow/ipc/test_common.h"
#include "arrow/ipc/writer.h"
#include "arrow/record_batch.h"
#include "arrow/status.h"
Expand All @@ -41,6 +42,8 @@

namespace arrow {

using arrow::ipc::test::RoundtripBatch;

class Parametric1Array : public ExtensionArray {
public:
using ExtensionArray::ExtensionArray;
Expand Down Expand Up @@ -178,7 +181,7 @@ class ExtStructType : public ExtensionType {

class TestExtensionType : public ::testing::Test {
public:
void SetUp() { ASSERT_OK(RegisterExtensionType(std::make_shared<UuidType>())); }
void SetUp() { ASSERT_OK(RegisterExtensionType(std::make_shared<ExampleUuidType>())); }

void TearDown() {
if (GetExtensionType("uuid")) {
Expand Down Expand Up @@ -211,20 +214,6 @@ TEST_F(TestExtensionType, ExtensionTypeTest) {
ASSERT_EQ(deserialized->byte_width(), 16);
}

// Test helper: writes `batch` to an in-memory IPC stream and reads the first
// record batch of that stream back into `*out`.
auto RoundtripBatch = [](const std::shared_ptr<RecordBatch>& batch,
std::shared_ptr<RecordBatch>* out) {
// Serialize the batch as a record batch stream using default write options.
ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create());
ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(),
out_stream.get()));

// Finish() yields the buffer holding everything written so far.
ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish());

// Re-open the buffer as a stream reader and pull the next batch into `out`.
io::BufferReader reader(complete_ipc_stream);
std::shared_ptr<RecordBatchReader> batch_reader;
ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader));
ASSERT_OK(batch_reader->ReadNext(out));
};

TEST_F(TestExtensionType, IpcRoundtrip) {
auto ext_arr = ExampleUuid();
auto batch = RecordBatch::Make(schema({field("f0", uuid())}), 4, {ext_arr});
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/integration/json_integration_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1046,7 +1046,7 @@ TEST(TestJsonFileReadWrite, JsonExample2) {

auto storage_array =
ArrayFromJSON(fixed_size_binary(16), R"(["0123456789abcdef", null])");
AssertArraysEqual(*batch->column(0), UuidArray(uuid_type, storage_array));
AssertArraysEqual(*batch->column(0), ExampleUuidArray(uuid_type, storage_array));

AssertArraysEqual(*batch->column(1), NullArray(2));
}
Expand Down
Loading

0 comments on commit 2328b6e

Please sign in to comment.