forked from apache/arrow
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
apacheGH-32538: [C++][Parquet] Add JSON canonical extension type (apa…
…che#13901) Arrow now provides a canonical extension type for JSON data. This extension is backed by utf8(). Parquet will recognize this extension and appropriately propagate the LogicalType to the storage format. * GitHub Issue: apache#32538 Lead-authored-by: Rok Mihevc <[email protected]> Co-authored-by: Pradeep Gollakota <[email protected]> Co-authored-by: Antoine Pitrou <[email protected]> Co-authored-by: mwish <[email protected]> Co-authored-by: Antoine Pitrou <[email protected]> Signed-off-by: Antoine Pitrou <[email protected]>
- Loading branch information
Showing
20 changed files
with
460 additions
and
56 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
#include "arrow/extension/json.h" | ||
|
||
#include <string> | ||
|
||
#include "arrow/extension_type.h" | ||
#include "arrow/result.h" | ||
#include "arrow/status.h" | ||
#include "arrow/type_fwd.h" | ||
#include "arrow/util/logging.h" | ||
|
||
namespace arrow::extension { | ||
|
||
bool JsonExtensionType::ExtensionEquals(const ExtensionType& other) const { | ||
return other.extension_name() == this->extension_name(); | ||
} | ||
|
||
/// \brief Reconstruct a JsonExtensionType from a storage type.
///
/// Only STRING, STRING_VIEW and LARGE_STRING storage types are accepted.
/// The `serialized` payload is not inspected by this implementation.
///
/// \param storage_type the storage type the extension should wrap
/// \param serialized the serialized extension metadata (unused here)
/// \return the new extension type, or Status::Invalid for any other storage type
Result<std::shared_ptr<DataType>> JsonExtensionType::Deserialize(
    std::shared_ptr<DataType> storage_type, const std::string& serialized) const {
  switch (storage_type->id()) {
    case Type::STRING:
    case Type::STRING_VIEW:
    case Type::LARGE_STRING:
      return std::make_shared<JsonExtensionType>(storage_type);
    default:
      return Status::Invalid("Invalid storage type for JsonExtensionType: ",
                             storage_type->ToString());
  }
}
|
||
/// \brief Serialize the extension metadata.
///
/// \return always an empty string
std::string JsonExtensionType::Serialize() const {
  return std::string{};
}
|
||
std::shared_ptr<Array> JsonExtensionType::MakeArray( | ||
std::shared_ptr<ArrayData> data) const { | ||
DCHECK_EQ(data->type->id(), Type::EXTENSION); | ||
DCHECK_EQ("arrow.json", | ||
internal::checked_cast<const ExtensionType&>(*data->type).extension_name()); | ||
return std::make_shared<ExtensionArray>(data); | ||
} | ||
|
||
std::shared_ptr<DataType> json(const std::shared_ptr<DataType> storage_type) { | ||
ARROW_CHECK(storage_type->id() != Type::STRING || | ||
storage_type->id() != Type::STRING_VIEW || | ||
storage_type->id() != Type::LARGE_STRING); | ||
return std::make_shared<JsonExtensionType>(storage_type); | ||
} | ||
|
||
} // namespace arrow::extension |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
#pragma once | ||
|
||
#include <stdexcept> | ||
#include <string> | ||
|
||
#include "arrow/extension_type.h" | ||
#include "arrow/result.h" | ||
#include "arrow/type_fwd.h" | ||
#include "arrow/util/visibility.h" | ||
|
||
namespace arrow::extension { | ||
|
||
/// \brief Concrete type class for variable-size JSON data, utf8-encoded.
///
/// The extension is identified by the name "arrow.json" and wraps the storage
/// type handed to the constructor.
class ARROW_EXPORT JsonExtensionType : public ExtensionType {
 public:
  /// \brief Construct a JSON extension type over `storage_type`.
  ///
  /// The storage type is forwarded to the ExtensionType base; no second copy
  /// is kept in this class, as a same-named member here would only shadow the
  /// base's and was never read.
  explicit JsonExtensionType(const std::shared_ptr<DataType>& storage_type)
      : ExtensionType(storage_type) {}

  /// \brief The extension name, "arrow.json".
  std::string extension_name() const override { return "arrow.json"; }

  /// \brief Compare this extension type with another one.
  bool ExtensionEquals(const ExtensionType& other) const override;

  /// \brief Rebuild the extension type from a storage type and serialized data.
  Result<std::shared_ptr<DataType>> Deserialize(
      std::shared_ptr<DataType> storage_type,
      const std::string& serialized_data) const override;

  /// \brief Produce the serialized representation of this extension type.
  std::string Serialize() const override;

  /// \brief Create an array of this extension type from array data.
  std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const override;
};
|
||
/// \brief Build a JsonExtensionType instance over the given storage type.
///
/// \param storage_type the storage type to wrap; utf8() when omitted
/// \return the JsonExtensionType, as a DataType pointer
ARROW_EXPORT std::shared_ptr<DataType> json(
    std::shared_ptr<DataType> storage_type = utf8());
|
||
} // namespace arrow::extension |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
#include "arrow/extension/json.h" | ||
|
||
#include "arrow/array/validate.h" | ||
#include "arrow/ipc/test_common.h" | ||
#include "arrow/record_batch.h" | ||
#include "arrow/testing/gtest_util.h" | ||
#include "parquet/exception.h" | ||
|
||
namespace arrow { | ||
|
||
using arrow::ipc::test::RoundtripBatch; | ||
using extension::json; | ||
|
||
// Fixture for the JSON extension type tests; it adds no state or helpers.
struct TestJsonExtensionType : public ::testing::Test {};
|
||
std::shared_ptr<Array> ExampleJson(const std::shared_ptr<DataType>& storage_type) { | ||
std::shared_ptr<Array> arr = ArrayFromJSON(storage_type, R"([ | ||
"null", | ||
"1234", | ||
"3.14159", | ||
"true", | ||
"false", | ||
"\"a json string\"", | ||
"[\"a\", \"json\", \"array\"]", | ||
"{\"obj\": \"a simple json object\"}" | ||
])"); | ||
return ExtensionType::WrapArray(arrow::extension::json(storage_type), arr); | ||
} | ||
|
||
// Round-trips a JSON extension array through RoundtripBatch for every
// supported storage type and checks the result is equal to the input and
// still valid.
TEST_F(TestJsonExtensionType, JsonRoundtrip) {
  for (const auto& storage_type : {utf8(), large_utf8(), utf8_view()}) {
    std::shared_ptr<Array> ext_arr = ExampleJson(storage_type);
    // Take the row count from the array itself rather than hard-coding it, so
    // the test does not silently depend on how many elements ExampleJson has.
    auto batch = RecordBatch::Make(schema({field("f0", json(storage_type))}),
                                   ext_arr->length(), {ext_arr});

    std::shared_ptr<RecordBatch> read_batch;
    ASSERT_OK(RoundtripBatch(batch, &read_batch));
    ASSERT_OK(read_batch->ValidateFull());
    CompareBatch(*batch, *read_batch, /*compare_metadata*/ true);

    // The column read back must still be valid UTF-8 and a valid array.
    auto read_ext_arr = read_batch->column(0);
    ASSERT_OK(internal::ValidateUTF8(*read_ext_arr));
    ASSERT_OK(read_ext_arr->ValidateFull());
  }
}
|
||
// Wraps storage containing invalid UTF-8 in the JSON extension type and checks
// that both full validation and the UTF-8 validator report it, while
// RoundtripBatch on the same data still succeeds.
TEST_F(TestJsonExtensionType, InvalidUTF8) {
  const std::string expected_error = "Invalid: Invalid UTF8 sequence at string index 0";

  for (const auto& storage_type : {utf8(), large_utf8(), utf8_view()}) {
    auto json_type = json(storage_type);
    auto bad_storage = ArrayFromJSON(storage_type, "[\"Ⱥa\xFFⱭ\", \"Ɽ\xe1\xbdⱤaA\"]");
    auto bad_ext_arr = ExtensionType::WrapArray(json_type, bad_storage);

    // Both validation entry points must flag the first string.
    ASSERT_RAISES_WITH_MESSAGE(Invalid, expected_error, bad_ext_arr->ValidateFull());
    ASSERT_RAISES_WITH_MESSAGE(Invalid, expected_error,
                               arrow::internal::ValidateUTF8(*bad_ext_arr));

    // The round trip itself is expected to succeed on this data.
    auto fields = {field("f0", json_type)};
    auto batch = RecordBatch::Make(schema(fields), 2, {bad_ext_arr});
    std::shared_ptr<RecordBatch> read_batch;
    ASSERT_OK(RoundtripBatch(batch, &read_batch));
  }
}
|
||
} // namespace arrow |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.