-
Notifications
You must be signed in to change notification settings - Fork 3.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
### Rationale for this change See #15058. The UUID datatype is common throughout the ecosystem, and Arrow supporting it as a native type would reduce friction. ### What changes are included in this PR? This PR implements the logic for the Arrow canonical UUID extension type in C++ and a Python wrapper. ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes, a new extension type is added. * Closes: #15058 Authored-by: Rok Mihevc <[email protected]> Signed-off-by: Antoine Pitrou <[email protected]>
- Loading branch information
Showing
29 changed files
with
412 additions
and
132 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
#include <sstream> | ||
|
||
#include "arrow/extension_type.h" | ||
#include "arrow/util/logging.h" | ||
|
||
#include "arrow/extension/uuid.h" | ||
|
||
namespace arrow::extension { | ||
|
||
// Equality between extension types is decided purely by extension name: | ||
// `other` is equal to this type exactly when both names match. | ||
bool UuidType::ExtensionEquals(const ExtensionType& other) const { | ||
  return extension_name() == other.extension_name(); | ||
} | ||
|
||
// Wraps `data` in a UuidArray and returns it as a generic Array pointer. | ||
// The two DCHECKs assert that `data->type` is an extension type named | ||
// "arrow.uuid" (NOTE(review): DCHECK is presumably active in debug builds | ||
// only -- confirm); no other validation of `data` happens here. | ||
std::shared_ptr<Array> UuidType::MakeArray(std::shared_ptr<ArrayData> data) const { | ||
DCHECK_EQ(data->type->id(), Type::EXTENSION); | ||
DCHECK_EQ("arrow.uuid", | ||
static_cast<const ExtensionType&>(*data->type).extension_name()); | ||
return std::make_shared<UuidArray>(data); | ||
} | ||
|
||
// Reconstructs a UuidType from a storage type and serialized metadata. | ||
// Returns Status::Invalid when `serialized` is non-empty, or when | ||
// `storage_type` is not equal to fixed_size_binary(16). The metadata check | ||
// runs first, so it wins when both inputs are wrong. | ||
Result<std::shared_ptr<DataType>> UuidType::Deserialize( | ||
    std::shared_ptr<DataType> storage_type, const std::string& serialized) const { | ||
  const bool has_metadata = !serialized.empty(); | ||
  if (has_metadata) { | ||
    return Status::Invalid("Unexpected serialized metadata: '", serialized, "'"); | ||
  } | ||
  const auto expected_storage = fixed_size_binary(16); | ||
  if (storage_type->Equals(*expected_storage)) { | ||
    return std::make_shared<UuidType>(); | ||
  } | ||
  return Status::Invalid("Invalid storage type for UuidType: ", | ||
                         storage_type->ToString()); | ||
} | ||
|
||
// Renders the type as "extension<NAME>", where NAME is extension_name(). | ||
// `show_metadata` is part of the overridden signature and is not used here. | ||
std::string UuidType::ToString(bool show_metadata) const { | ||
  return "extension<" + extension_name() + ">"; | ||
} | ||
|
||
std::shared_ptr<DataType> uuid() { return std::make_shared<UuidType>(); } | ||
|
||
} // namespace arrow::extension |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
#pragma once | ||
|
||
#include "arrow/extension_type.h" | ||
|
||
namespace arrow::extension { | ||
|
||
/// \brief UuidArray stores an array of UUIDs. The underlying storage type is | ||
/// FixedSizeBinary(16). All constructors are inherited from ExtensionArray. | ||
class ARROW_EXPORT UuidArray : public ExtensionArray { | ||
public: | ||
using ExtensionArray::ExtensionArray; | ||
}; | ||
|
||
/// \brief UuidType is the canonical Arrow extension type for UUIDs. | ||
/// UUIDs are stored as FixedSizeBinary(16) in big-endian notation. The type | ||
/// does not interpret the bytes in any way, and no specific UUID version is | ||
/// required or guaranteed. | ||
class ARROW_EXPORT UuidType : public ExtensionType { | ||
public: | ||
/// \brief Construct a UuidType with fixed_size_binary(16) storage. | ||
UuidType() : ExtensionType(fixed_size_binary(16)) {} | ||
|
||
/// \brief Return the extension name, "arrow.uuid". | ||
std::string extension_name() const override { return "arrow.uuid"; } | ||
/// \brief Return a string representation of this type. | ||
std::string ToString(bool show_metadata = false) const override; | ||
|
||
/// \brief Compare this type with another extension type. | ||
bool ExtensionEquals(const ExtensionType& other) const override; | ||
|
||
/// \brief Create a UuidArray from ArrayData. | ||
std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const override; | ||
|
||
/// \brief Reconstruct a UuidType from a storage type and serialized metadata. | ||
/// \param[in] storage_type the storage type; must equal fixed_size_binary(16) | ||
/// \param[in] serialized the serialized metadata; must be empty | ||
/// \return the UuidType, or an Invalid status if either argument is rejected | ||
Result<std::shared_ptr<DataType>> Deserialize( | ||
std::shared_ptr<DataType> storage_type, | ||
const std::string& serialized) const override; | ||
|
||
/// \brief Return the serialized metadata, which is always the empty string. | ||
std::string Serialize() const override { return ""; } | ||
|
||
/// \brief Create a UuidType instance, wrapped in a Result. | ||
static Result<std::shared_ptr<DataType>> Make() { return std::make_shared<UuidType>(); } | ||
}; | ||
|
||
/// \brief Return a UuidType instance as a DataType pointer. | ||
ARROW_EXPORT std::shared_ptr<DataType> uuid(); | ||
|
||
} // namespace arrow::extension |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
#include "arrow/extension/uuid.h" | ||
|
||
#include "arrow/testing/matchers.h" | ||
|
||
#include "arrow/io/memory.h" | ||
#include "arrow/ipc/reader.h" | ||
#include "arrow/ipc/test_common.h" | ||
#include "arrow/testing/gtest_util.h" | ||
#include "arrow/util/key_value_metadata.h" | ||
|
||
#include "arrow/testing/extension_type.h" | ||
|
||
namespace arrow { | ||
|
||
using arrow::ipc::test::RoundtripBatch; | ||
|
||
// Checks that the UUID extension type survives a Serialize()/Deserialize() | ||
// round trip and stays distinct from its plain fixed_size_binary(16) storage. | ||
TEST(TestUuuidExtensionType, ExtensionTypeTest) { | ||
  // Qualify the factory explicitly (as the RoundtripBatch test does) so the | ||
  // type under test is always arrow::extension::UuidType, regardless of any | ||
  // other `uuid()` that the included headers make visible in namespace arrow. | ||
  auto type = extension::uuid(); | ||
  ASSERT_EQ(type->id(), Type::EXTENSION); | ||
|
||
  const auto& ext_type = static_cast<const ExtensionType&>(*type); | ||
  std::string serialized = ext_type.Serialize(); | ||
|
||
  // Deserializing the type's own metadata must give back an equal type. | ||
  ASSERT_OK_AND_ASSIGN(auto deserialized, | ||
                       ext_type.Deserialize(fixed_size_binary(16), serialized)); | ||
  ASSERT_TRUE(deserialized->Equals(*type)); | ||
  ASSERT_FALSE(deserialized->Equals(*fixed_size_binary(16))); | ||
} | ||
|
||
// Round-trips record batches containing UUID data through RoundtripBatch and | ||
// checks that the result compares equal to the extension-typed batch, both | ||
// when the input is an extension array and when it is a storage array whose | ||
// field carries the extension metadata. | ||
TEST(TestUuuidExtensionType, RoundtripBatch) { | ||
auto ext_type = extension::uuid(); | ||
auto exact_ext_type = internal::checked_pointer_cast<extension::UuidType>(ext_type); | ||
auto arr = ArrayFromJSON(fixed_size_binary(16), R"(["abcdefghijklmnop", null])"); | ||
auto ext_arr = ExtensionType::WrapArray(ext_type, arr); | ||
|
||
// Pass an extension array; expect to get an extension array back. | ||
std::shared_ptr<RecordBatch> read_batch; | ||
auto ext_field = field(/*name=*/"f0", /*type=*/ext_type); | ||
auto batch = RecordBatch::Make(schema({ext_field}), ext_arr->length(), {ext_arr}); | ||
RoundtripBatch(batch, &read_batch); | ||
CompareBatch(*batch, *read_batch, /*compare_metadata=*/true); | ||
|
||
// Pass extension metadata plus the storage array; expect to get an | ||
// extension array back (the result is compared against `batch`, not `batch2`). | ||
std::shared_ptr<RecordBatch> read_batch2; | ||
auto ext_metadata = | ||
key_value_metadata({{"ARROW:extension:name", exact_ext_type->extension_name()}, | ||
{"ARROW:extension:metadata", ""}}); | ||
ext_field = field(/*name=*/"f0", /*type=*/exact_ext_type->storage_type(), | ||
/*nullable=*/true, /*metadata=*/ext_metadata); | ||
auto batch2 = RecordBatch::Make(schema({ext_field}), arr->length(), {arr}); | ||
RoundtripBatch(batch2, &read_batch2); | ||
CompareBatch(*batch, *read_batch2, /*compare_metadata=*/true); | ||
} | ||
|
||
} // namespace arrow |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.