From 0dd11098ec9aa4648061cf654cb992a4e6df99bb Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Fri, 25 Mar 2016 14:22:01 +0100 Subject: [PATCH 1/7] ARROW-78: Add constructor for DecimalType --- cpp/CMakeLists.txt | 1 + cpp/src/arrow/types/decimal.cc | 32 ++++++++++++++++++++++++++++++++ cpp/src/arrow/types/decimal.h | 11 +++++++++++ 3 files changed, 44 insertions(+) create mode 100644 cpp/src/arrow/types/decimal.cc diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 6d701079b482c..e7d332b2f4914 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -528,6 +528,7 @@ set(ARROW_SRCS src/arrow/ipc/metadata-internal.cc src/arrow/types/construct.cc + src/arrow/types/decimal.cc src/arrow/types/json.cc src/arrow/types/list.cc src/arrow/types/primitive.cc diff --git a/cpp/src/arrow/types/decimal.cc b/cpp/src/arrow/types/decimal.cc new file mode 100644 index 0000000000000..f120c1a9dfde6 --- /dev/null +++ b/cpp/src/arrow/types/decimal.cc @@ -0,0 +1,32 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/types/decimal.h" + +#include +#include + +namespace arrow { + +std::string DecimalType::ToString() const { + std::stringstream s; + s << "decimal(" << precision << ", " << scale << ")"; + return s.str(); +} + +} // namespace arrow + diff --git a/cpp/src/arrow/types/decimal.h b/cpp/src/arrow/types/decimal.h index 464c3ff8da92b..26243b42b0e7d 100644 --- a/cpp/src/arrow/types/decimal.h +++ b/cpp/src/arrow/types/decimal.h @@ -18,13 +18,24 @@ #ifndef ARROW_TYPES_DECIMAL_H #define ARROW_TYPES_DECIMAL_H +#include + #include "arrow/type.h" namespace arrow { struct DecimalType : public DataType { + explicit DecimalType(int precision_, int scale_) + : DataType(Type::DECIMAL), precision(precision_), + scale(scale_) { } int precision; int scale; + + static char const *name() { + return "decimal"; + } + + std::string ToString() const override; }; } // namespace arrow From 0e2a7f1942574f6b03a1cb9e3c18ef570fcdcd4d Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Fri, 25 Mar 2016 18:37:26 +0100 Subject: [PATCH 2/7] Add macro for adding dependencies to tests --- cpp/CMakeLists.txt | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index e7d332b2f4914..6ed2768d13918 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -378,6 +378,16 @@ function(ADD_ARROW_TEST_DEPENDENCIES REL_TEST_NAME) add_dependencies(${TEST_NAME} ${ARGN}) endfunction() +# A wrapper for target_link_libraries() that is compatible with NO_TESTS. +function(ARROW_TEST_LINK_LIBRARIES REL_TEST_NAME) + if(NO_TESTS) + return() + endif() + get_filename_component(TEST_NAME ${REL_TEST_NAME} NAME_WE) + + target_link_libraries(${TEST_NAME} ${ARGN}) +endfunction() + enable_testing() ############################################################ From b7b9ca94b2a6df696f2a76392ce59495a4c2df73 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Fri, 25 Mar 2016 19:02:24 +0100 Subject: [PATCH 3/7] Add basic conversion for primitive types --- cpp/src/arrow/parquet/CMakeLists.txt | 6 + cpp/src/arrow/parquet/parquet-schema-test.cc | 91 +++++++++++++++ cpp/src/arrow/parquet/schema.cc | 114 +++++++++++++++++++ cpp/src/arrow/parquet/schema.h | 40 +++++++ 4 files changed, 251 insertions(+) create mode 100644 cpp/src/arrow/parquet/parquet-schema-test.cc create mode 100644 cpp/src/arrow/parquet/schema.cc create mode 100644 cpp/src/arrow/parquet/schema.h diff --git a/cpp/src/arrow/parquet/CMakeLists.txt b/cpp/src/arrow/parquet/CMakeLists.txt index 7b449affab025..f9479900bb135 100644 --- a/cpp/src/arrow/parquet/CMakeLists.txt +++ b/cpp/src/arrow/parquet/CMakeLists.txt @@ -19,9 +19,12 @@ # arrow_parquet : Arrow <-> Parquet adapter set(PARQUET_SRCS + schema.cc ) set(PARQUET_LIBS + arrow + ${PARQUET_SHARED_LIB} ) add_library(arrow_parquet STATIC @@ -30,6 +33,9 @@ add_library(arrow_parquet STATIC target_link_libraries(arrow_parquet ${PARQUET_LIBS}) SET_TARGET_PROPERTIES(arrow_parquet PROPERTIES LINKER_LANGUAGE CXX) +ADD_ARROW_TEST(parquet-schema-test) +ARROW_TEST_LINK_LIBRARIES(parquet-schema-test arrow_parquet) + # Headers: top level install(FILES DESTINATION include/arrow/parquet) diff --git a/cpp/src/arrow/parquet/parquet-schema-test.cc b/cpp/src/arrow/parquet/parquet-schema-test.cc new file mode 100644 index 0000000000000..e562df9c57704 --- /dev/null +++ b/cpp/src/arrow/parquet/parquet-schema-test.cc @@ -0,0 +1,91 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "gtest/gtest.h" + +#include "arrow/parquet/schema.h" + +namespace arrow { + +namespace parquet { + +using parquet_cpp::Repetition; +using parquet_cpp::schema::NodePtr; +using parquet_cpp::schema::PrimitiveNode; + +TEST(TestNodeConversion, Primitive) { + NodePtr node = PrimitiveNode::Make("boolean", Repetition::REQUIRED, + parquet_cpp::Type::BOOLEAN); + std::shared_ptr field = NodeToField(node); + ASSERT_EQ(field->name, "boolean"); + ASSERT_TRUE(field->type->Equals(std::make_shared())); + ASSERT_FALSE(field->nullable); + + node = PrimitiveNode::Make("int32", Repetition::REQUIRED, parquet_cpp::Type::INT32); + field = NodeToField(node); + ASSERT_EQ(field->name, "int32"); + ASSERT_TRUE(field->type->Equals(std::make_shared())); + ASSERT_FALSE(field->nullable); + + node = PrimitiveNode::Make("int64", Repetition::REQUIRED, parquet_cpp::Type::INT64); + field = NodeToField(node); + ASSERT_EQ(field->name, "int64"); + ASSERT_TRUE(field->type->Equals(std::make_shared())); + ASSERT_FALSE(field->nullable); + + // case parquet_cpp::Type::INT96: + // TODO: Implement! + // node = PrimitiveNode::Make("int96", Repetition::REQUIRED, parquet_cpp::Type::INT96); + // field = NodeToField(node); + // TODO: Assertions + + // case parquet_cpp::Type::FLOAT: + node = PrimitiveNode::Make("float", Repetition::REQUIRED, parquet_cpp::Type::FLOAT); + field = NodeToField(node); + ASSERT_EQ(field->name, "float"); + ASSERT_TRUE(field->type->Equals(std::make_shared())); + ASSERT_FALSE(field->nullable); + + // case parquet_cpp::Type::DOUBLE: + node = PrimitiveNode::Make("double", Repetition::REQUIRED, parquet_cpp::Type::DOUBLE); + field = NodeToField(node); + ASSERT_EQ(field->name, "double"); + ASSERT_TRUE(field->type->Equals(std::make_shared())); + ASSERT_FALSE(field->nullable); + + // TODO: Implement! + // node = PrimitiveNode::Make("byte_array", Repetition::REQUIRED, + // parquet_cpp::Type::BYTE_ARRAY); + // field = NodeToField(node); + // TODO: Assertions + + // TODO: Implement! + // node = PrimitiveNode::Make("fixed_len_byte_array", Repetition::REQUIRED, + // parquet_cpp::Type::FIXED_LEN_BYTE_ARRAY); + // field = NodeToField(node); + // TODO: Assertions +} + +TEST(TestNodeConversion, Logical) { +} + +TEST(TestSchemaConversion, Basics) { +} + +} // namespace parquet + +} // namespace arrow diff --git a/cpp/src/arrow/parquet/schema.cc b/cpp/src/arrow/parquet/schema.cc new file mode 100644 index 0000000000000..3ac169d2fe1dc --- /dev/null +++ b/cpp/src/arrow/parquet/schema.cc @@ -0,0 +1,114 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "arrow/parquet/schema.h" +#include "arrow/types/decimal.h" + +using parquet_cpp::schema::Node; +using parquet_cpp::schema::NodePtr; +using parquet_cpp::schema::GroupNode; +using parquet_cpp::schema::PrimitiveNode; + +namespace arrow { + +namespace parquet { + + +TypePtr MakeDecimalType(const PrimitiveNode* node) { + int precision = node->decimal_metadata().precision; + int scale = node->decimal_metadata().scale; + return TypePtr(new DecimalType(precision, scale)); +} + +// TODO: Logical Type Handling +std::shared_ptr NodeToField(const NodePtr& node) { + TypePtr type; + + if (node->is_group()) { + const GroupNode* group = static_cast(node.get()); + std::vector> fields; + for (int i = 0; i < group->field_count(); i++) { + fields.push_back(NodeToField(group->field(i))); + } + type = TypePtr(new StructType(fields)); + } else { + // Primitive (leaf) node + const PrimitiveNode* primitive = static_cast(node.get()); + + switch (primitive->physical_type()) { + case parquet_cpp::Type::BOOLEAN: + type = TypePtr(new BooleanType()); + break; + case parquet_cpp::Type::INT32: + type = TypePtr(new Int32Type()); + break; + case parquet_cpp::Type::INT64: + type = TypePtr(new Int64Type()); + break; + case parquet_cpp::Type::INT96: + // TODO: Do we have that type in Arrow? + // type = TypePtr(new Int96Type()); + break; + case parquet_cpp::Type::FLOAT: + type = TypePtr(new FloatType()); + break; + case parquet_cpp::Type::DOUBLE: + type = TypePtr(new DoubleType()); + break; + case parquet_cpp::Type::BYTE_ARRAY: + // TODO: Do we have that type in Arrow? + // type = TypePtr(new Int96Type()); + break; + case parquet_cpp::Type::FIXED_LEN_BYTE_ARRAY: + switch (primitive->logical_type()) { + case parquet_cpp::LogicalType::DECIMAL: + type = MakeDecimalType(primitive); + break; + default: + // TODO: Do we have that type in Arrow? + break; + } + break; + } + } + + if (node->is_repeated()) { + type = TypePtr(new ListType(type)); + } + + return std::shared_ptr(new Field(node->name(), type, !node->is_required())); +} + +std::shared_ptr FromParquetSchema( + const parquet_cpp::SchemaDescriptor* parquet_schema) { + std::vector> fields; + const GroupNode* schema_node = static_cast( + parquet_schema->schema().get()); + + // TODO: What to with the head node? + for (int i = 0; i < schema_node->field_count(); i++) { + fields.push_back(NodeToField(schema_node->field(i))); + } + + return std::shared_ptr(new Schema(fields)); +} + +} // namespace parquet + +} // namespace arrow diff --git a/cpp/src/arrow/parquet/schema.h b/cpp/src/arrow/parquet/schema.h new file mode 100644 index 0000000000000..0071681656b8c --- /dev/null +++ b/cpp/src/arrow/parquet/schema.h @@ -0,0 +1,40 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_PARQUET_SCHEMA_H +#define ARROW_PARQUET_SCHEMA_H + +#include +#include +#include +#include + +#include + +namespace arrow { + +namespace parquet { + +std::shared_ptr NodeToField(const parquet_cpp::schema::NodePtr& node); +std::shared_ptr FromParquetSchema( + const parquet_cpp::SchemaDescriptor* parquet_schema); + +} // namespace parquet + +} // namespace arrow + +#endif From 74d6bae3b518c64125797ca0ff8e9b81650010c0 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 25 Mar 2016 22:32:42 -0700 Subject: [PATCH 4/7] Convert BYTE_ARRAY to StringType or List depending on the logical type --- cpp/src/arrow/parquet/CMakeLists.txt | 8 +- cpp/src/arrow/parquet/parquet-schema-test.cc | 49 +++++++++-- cpp/src/arrow/parquet/schema.cc | 90 ++++++++++++++------ cpp/src/arrow/parquet/schema.h | 20 +++-- 4 files changed, 124 insertions(+), 43 deletions(-) diff --git a/cpp/src/arrow/parquet/CMakeLists.txt b/cpp/src/arrow/parquet/CMakeLists.txt index f9479900bb135..0d5cf263ec3e2 100644 --- a/cpp/src/arrow/parquet/CMakeLists.txt +++ b/cpp/src/arrow/parquet/CMakeLists.txt @@ -19,15 +19,15 @@ # arrow_parquet : Arrow <-> Parquet adapter set(PARQUET_SRCS - schema.cc + schema.cc ) set(PARQUET_LIBS - arrow - ${PARQUET_SHARED_LIB} + arrow + ${PARQUET_SHARED_LIB} ) -add_library(arrow_parquet STATIC +add_library(arrow_parquet SHARED ${PARQUET_SRCS} ) target_link_libraries(arrow_parquet ${PARQUET_LIBS}) diff --git a/cpp/src/arrow/parquet/parquet-schema-test.cc b/cpp/src/arrow/parquet/parquet-schema-test.cc index e562df9c57704..4debb2964edb3 100644 --- a/cpp/src/arrow/parquet/parquet-schema-test.cc +++ b/cpp/src/arrow/parquet/parquet-schema-test.cc @@ -19,6 +19,9 @@ #include "arrow/parquet/schema.h" +#include "arrow/test-util.h" +#include "arrow/util/status.h" + namespace arrow { namespace parquet { @@ -28,21 +31,24 @@ using parquet_cpp::schema::NodePtr; using parquet_cpp::schema::PrimitiveNode; TEST(TestNodeConversion, Primitive) { + std::shared_ptr field; + NodePtr node = PrimitiveNode::Make("boolean", Repetition::REQUIRED, parquet_cpp::Type::BOOLEAN); - std::shared_ptr field = NodeToField(node); + + ASSERT_OK(NodeToField(node, &field)); ASSERT_EQ(field->name, "boolean"); ASSERT_TRUE(field->type->Equals(std::make_shared())); ASSERT_FALSE(field->nullable); node = PrimitiveNode::Make("int32", Repetition::REQUIRED, parquet_cpp::Type::INT32); - field = NodeToField(node); + ASSERT_OK(NodeToField(node, &field)); ASSERT_EQ(field->name, "int32"); ASSERT_TRUE(field->type->Equals(std::make_shared())); ASSERT_FALSE(field->nullable); node = PrimitiveNode::Make("int64", Repetition::REQUIRED, parquet_cpp::Type::INT64); - field = NodeToField(node); + ASSERT_OK(NodeToField(node, &field)); ASSERT_EQ(field->name, "int64"); ASSERT_TRUE(field->type->Equals(std::make_shared())); ASSERT_FALSE(field->nullable); @@ -55,14 +61,14 @@ TEST(TestNodeConversion, Primitive) { // case parquet_cpp::Type::FLOAT: node = PrimitiveNode::Make("float", Repetition::REQUIRED, parquet_cpp::Type::FLOAT); - field = NodeToField(node); + ASSERT_OK(NodeToField(node, &field)); ASSERT_EQ(field->name, "float"); ASSERT_TRUE(field->type->Equals(std::make_shared())); ASSERT_FALSE(field->nullable); // case parquet_cpp::Type::DOUBLE: node = PrimitiveNode::Make("double", Repetition::REQUIRED, parquet_cpp::Type::DOUBLE); - field = NodeToField(node); + ASSERT_OK(NodeToField(node, &field)); ASSERT_EQ(field->name, "double"); ASSERT_TRUE(field->type->Equals(std::make_shared())); ASSERT_FALSE(field->nullable); @@ -80,6 +86,39 @@ TEST(TestNodeConversion, Primitive) { // TODO: Assertions } +const auto UINT8 = std::make_shared(); + +TEST(TestNodeConversion, Int96Timestamp) { +} + +TEST(TestNodeConversion, ByteArray) { + std::shared_ptr field; + + NodePtr node = PrimitiveNode::Make("field0", Repetition::OPTIONAL, + parquet_cpp::Type::BYTE_ARRAY); + ASSERT_OK(NodeToField(node, &field)); + + std::shared_ptr ex_type = std::make_shared( + std::make_shared("", UINT8)); + + ASSERT_EQ(field->name, "field0"); + ASSERT_TRUE(field->type->Equals(ex_type)); + ASSERT_TRUE(field->nullable); + + node = PrimitiveNode::Make("field1", Repetition::OPTIONAL, + parquet_cpp::Type::BYTE_ARRAY, + parquet_cpp::LogicalType::UTF8); + ASSERT_OK(NodeToField(node, &field)); + ex_type = std::make_shared(); + + ASSERT_EQ(field->name, "field1"); + ASSERT_TRUE(field->type->Equals(ex_type)); + ASSERT_TRUE(field->nullable); +} + +TEST(TestNodeConversion, FixedLenByteArray) { +} + TEST(TestNodeConversion, Logical) { } diff --git a/cpp/src/arrow/parquet/schema.cc b/cpp/src/arrow/parquet/schema.cc index 3ac169d2fe1dc..63f7e47bf73bc 100644 --- a/cpp/src/arrow/parquet/schema.cc +++ b/cpp/src/arrow/parquet/schema.cc @@ -15,9 +15,13 @@ // specific language governing permissions and limitations // under the License. +#include "arrow/parquet/schema.h" + #include -#include "arrow/parquet/schema.h" +#include "parquet/api/schema.h" + +#include "arrow/util/status.h" #include "arrow/types/decimal.h" using parquet_cpp::schema::Node; @@ -25,65 +29,95 @@ using parquet_cpp::schema::NodePtr; using parquet_cpp::schema::GroupNode; using parquet_cpp::schema::PrimitiveNode; +using parquet_cpp::LogicalType; + namespace arrow { namespace parquet { +const auto BOOL = std::make_shared(); +const auto UINT8 = std::make_shared(); +const auto INT32 = std::make_shared(); +const auto INT64 = std::make_shared(); +const auto FLOAT = std::make_shared(); +const auto DOUBLE = std::make_shared(); +const auto UTF8 = std::make_shared(); +const auto BINARY = std::make_shared( + std::make_shared("", UINT8)); TypePtr MakeDecimalType(const PrimitiveNode* node) { int precision = node->decimal_metadata().precision; int scale = node->decimal_metadata().scale; - return TypePtr(new DecimalType(precision, scale)); + return std::make_shared(precision, scale); +} + +static Status FromByteArray(const PrimitiveNode* node, TypePtr* out) { + switch (node->logical_type()) { + case LogicalType::UTF8: + *out = UTF8; + break; + default: + // BINARY + *out = BINARY; + break; + } + return Status::OK(); +} + +static Status FromFLBA(const PrimitiveNode* node, TypePtr* out) { + switch (node->logical_type()) { + case LogicalType::DECIMAL: + *out = MakeDecimalType(node); + break; + default: + return Status::NotImplemented("unhandled type"); + break; + } + + return Status::OK(); } // TODO: Logical Type Handling -std::shared_ptr NodeToField(const NodePtr& node) { +Status NodeToField(const NodePtr& node, std::shared_ptr* out) { TypePtr type; if (node->is_group()) { const GroupNode* group = static_cast(node.get()); - std::vector> fields; + std::vector> fields(group->field_count()); for (int i = 0; i < group->field_count(); i++) { - fields.push_back(NodeToField(group->field(i))); + RETURN_NOT_OK(NodeToField(group->field(i), &fields[i])); } - type = TypePtr(new StructType(fields)); + type = std::make_shared(fields); } else { // Primitive (leaf) node const PrimitiveNode* primitive = static_cast(node.get()); switch (primitive->physical_type()) { case parquet_cpp::Type::BOOLEAN: - type = TypePtr(new BooleanType()); + type = BOOL; break; case parquet_cpp::Type::INT32: - type = TypePtr(new Int32Type()); + type = INT32; break; case parquet_cpp::Type::INT64: - type = TypePtr(new Int64Type()); + type = INT64; break; case parquet_cpp::Type::INT96: // TODO: Do we have that type in Arrow? // type = TypePtr(new Int96Type()); - break; + return Status::NotImplemented("int96"); case parquet_cpp::Type::FLOAT: - type = TypePtr(new FloatType()); + type = FLOAT; break; case parquet_cpp::Type::DOUBLE: - type = TypePtr(new DoubleType()); + type = DOUBLE; break; case parquet_cpp::Type::BYTE_ARRAY: // TODO: Do we have that type in Arrow? - // type = TypePtr(new Int96Type()); + RETURN_NOT_OK(FromByteArray(primitive, &type)); break; case parquet_cpp::Type::FIXED_LEN_BYTE_ARRAY: - switch (primitive->logical_type()) { - case parquet_cpp::LogicalType::DECIMAL: - type = MakeDecimalType(primitive); - break; - default: - // TODO: Do we have that type in Arrow? - break; - } + RETURN_NOT_OK(FromFLBA(primitive, &type)); break; } } @@ -92,21 +126,25 @@ std::shared_ptr NodeToField(const NodePtr& node) { type = TypePtr(new ListType(type)); } - return std::shared_ptr(new Field(node->name(), type, !node->is_required())); + *out = std::make_shared(node->name(), type, !node->is_required()); + + return Status::OK(); } -std::shared_ptr FromParquetSchema( - const parquet_cpp::SchemaDescriptor* parquet_schema) { +Status FromParquetSchema(const parquet_cpp::SchemaDescriptor* parquet_schema, + std::shared_ptr* out) { std::vector> fields; const GroupNode* schema_node = static_cast( parquet_schema->schema().get()); // TODO: What to with the head node? + fields.resize(schema_node->field_count()); for (int i = 0; i < schema_node->field_count(); i++) { - fields.push_back(NodeToField(schema_node->field(i))); + RETURN_NOT_OK(NodeToField(schema_node->field(i), &fields[i])); } - return std::shared_ptr(new Schema(fields)); + *out = std::make_shared(fields); + return Status::OK(); } } // namespace parquet diff --git a/cpp/src/arrow/parquet/schema.h b/cpp/src/arrow/parquet/schema.h index 0071681656b8c..61de193a33877 100644 --- a/cpp/src/arrow/parquet/schema.h +++ b/cpp/src/arrow/parquet/schema.h @@ -18,20 +18,24 @@ #ifndef ARROW_PARQUET_SCHEMA_H #define ARROW_PARQUET_SCHEMA_H -#include -#include -#include -#include - #include +#include "parquet/api/schema.h" + +#include "arrow/schema.h" +#include "arrow/type.h" + namespace arrow { +class Status; + namespace parquet { -std::shared_ptr NodeToField(const parquet_cpp::schema::NodePtr& node); -std::shared_ptr FromParquetSchema( - const parquet_cpp::SchemaDescriptor* parquet_schema); +Status NodeToField(const parquet_cpp::schema::NodePtr& node, + std::shared_ptr* out); + +Status FromParquetSchema(const parquet_cpp::SchemaDescriptor* parquet_schema, + std::shared_ptr* out); } // namespace parquet From 54daa9b4987b78ae254b68d1471fcfa52e5f4197 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sat, 26 Mar 2016 16:20:24 -0700 Subject: [PATCH 5/7] Refactor tests to invoke FromParquetSchema --- cpp/src/arrow/parquet/parquet-schema-test.cc | 150 +++++++++---------- 1 file changed, 75 insertions(+), 75 deletions(-) diff --git a/cpp/src/arrow/parquet/parquet-schema-test.cc b/cpp/src/arrow/parquet/parquet-schema-test.cc index 4debb2964edb3..b83242166a467 100644 --- a/cpp/src/arrow/parquet/parquet-schema-test.cc +++ b/cpp/src/arrow/parquet/parquet-schema-test.cc @@ -15,67 +15,103 @@ // specific language governing permissions and limitations // under the License. -#include "gtest/gtest.h" +#include +#include -#include "arrow/parquet/schema.h" +#include "gtest/gtest.h" #include "arrow/test-util.h" +#include "arrow/type.h" #include "arrow/util/status.h" +#include "arrow/parquet/schema.h" + namespace arrow { namespace parquet { using parquet_cpp::Repetition; using parquet_cpp::schema::NodePtr; +using parquet_cpp::schema::GroupNode; using parquet_cpp::schema::PrimitiveNode; -TEST(TestNodeConversion, Primitive) { +const auto BOOL = std::make_shared(); +const auto UINT8 = std::make_shared(); +const auto INT32 = std::make_shared(); +const auto INT64 = std::make_shared(); +const auto FLOAT = std::make_shared(); +const auto DOUBLE = std::make_shared(); +const auto UTF8 = std::make_shared(); +const auto BINARY = std::make_shared( + std::make_shared("", UINT8)); + +class TestConvertParquetSchema : public ::testing::Test { + public: + virtual void SetUp() {} + + void CheckFlatSchema(const std::vector& nodes, + const std::shared_ptr& expected_schema) { + NodePtr schema = GroupNode::Make("schema", Repetition::REPEATED, nodes); + + parquet_cpp::SchemaDescriptor descr; + descr.Init(schema); + + std::shared_ptr result_schema; + ASSERT_OK(FromParquetSchema(&descr, &result_schema)); + + ASSERT_EQ(expected_schema->num_fields(), result_schema->num_fields()); + for (size_t i = 0; i < nodes.size(); ++i) { + auto lhs = result_schema->field(i); + auto rhs = expected_schema->field(i); + EXPECT_TRUE(lhs->Equals(rhs)) + << i << " " << lhs->ToString() << " != " << rhs->ToString(); + } + } +}; + +TEST_F(TestConvertParquetSchema, ParquetFlatPrimitives) { + std::vector parquet_fields; + std::vector> arrow_fields; + std::shared_ptr field; - NodePtr node = PrimitiveNode::Make("boolean", Repetition::REQUIRED, - parquet_cpp::Type::BOOLEAN); + parquet_fields.push_back( + PrimitiveNode::Make("boolean", Repetition::REQUIRED, parquet_cpp::Type::BOOLEAN)); + arrow_fields.push_back(std::make_shared("boolean", BOOL, false)); - ASSERT_OK(NodeToField(node, &field)); - ASSERT_EQ(field->name, "boolean"); - ASSERT_TRUE(field->type->Equals(std::make_shared())); - ASSERT_FALSE(field->nullable); + parquet_fields.push_back( + PrimitiveNode::Make("int32", Repetition::REQUIRED, parquet_cpp::Type::INT32)); + arrow_fields.push_back(std::make_shared("int32", INT32, false)); - node = PrimitiveNode::Make("int32", Repetition::REQUIRED, parquet_cpp::Type::INT32); - ASSERT_OK(NodeToField(node, &field)); - ASSERT_EQ(field->name, "int32"); - ASSERT_TRUE(field->type->Equals(std::make_shared())); - ASSERT_FALSE(field->nullable); + parquet_fields.push_back( + PrimitiveNode::Make("int64", Repetition::REQUIRED, parquet_cpp::Type::INT64)); + arrow_fields.push_back(std::make_shared("int64", INT64, false)); - node = PrimitiveNode::Make("int64", Repetition::REQUIRED, parquet_cpp::Type::INT64); - ASSERT_OK(NodeToField(node, &field)); - ASSERT_EQ(field->name, "int64"); - ASSERT_TRUE(field->type->Equals(std::make_shared())); - ASSERT_FALSE(field->nullable); + parquet_fields.push_back( + PrimitiveNode::Make("float", Repetition::OPTIONAL, parquet_cpp::Type::FLOAT)); + arrow_fields.push_back(std::make_shared("float", FLOAT)); - // case parquet_cpp::Type::INT96: - // TODO: Implement! - // node = PrimitiveNode::Make("int96", Repetition::REQUIRED, parquet_cpp::Type::INT96); - // field = NodeToField(node); - // TODO: Assertions + parquet_fields.push_back( + PrimitiveNode::Make("double", Repetition::OPTIONAL, parquet_cpp::Type::DOUBLE)); + arrow_fields.push_back(std::make_shared("double", DOUBLE)); + + parquet_fields.push_back( + PrimitiveNode::Make("binary", Repetition::OPTIONAL, + parquet_cpp::Type::BYTE_ARRAY)); + arrow_fields.push_back(std::make_shared("binary", BINARY)); - // case parquet_cpp::Type::FLOAT: - node = PrimitiveNode::Make("float", Repetition::REQUIRED, parquet_cpp::Type::FLOAT); - ASSERT_OK(NodeToField(node, &field)); - ASSERT_EQ(field->name, "float"); - ASSERT_TRUE(field->type->Equals(std::make_shared())); - ASSERT_FALSE(field->nullable); + parquet_fields.push_back( + PrimitiveNode::Make("string", Repetition::OPTIONAL, + parquet_cpp::Type::BYTE_ARRAY, + parquet_cpp::LogicalType::UTF8)); + arrow_fields.push_back(std::make_shared("string", UTF8)); - // case parquet_cpp::Type::DOUBLE: - node = PrimitiveNode::Make("double", Repetition::REQUIRED, parquet_cpp::Type::DOUBLE); - ASSERT_OK(NodeToField(node, &field)); - ASSERT_EQ(field->name, "double"); - ASSERT_TRUE(field->type->Equals(std::make_shared())); - ASSERT_FALSE(field->nullable); + auto arrow_schema = std::make_shared(arrow_fields); + CheckFlatSchema(parquet_fields, arrow_schema); + // case parquet_cpp::Type::INT96: // TODO: Implement! - // node = PrimitiveNode::Make("byte_array", Repetition::REQUIRED, - // parquet_cpp::Type::BYTE_ARRAY); + // node = PrimitiveNode::Make("int96", Repetition::REQUIRED, parquet_cpp::Type::INT96); // field = NodeToField(node); // TODO: Assertions @@ -86,43 +122,7 @@ TEST(TestNodeConversion, Primitive) { // TODO: Assertions } -const auto UINT8 = std::make_shared(); - -TEST(TestNodeConversion, Int96Timestamp) { -} - -TEST(TestNodeConversion, ByteArray) { - std::shared_ptr field; - - NodePtr node = PrimitiveNode::Make("field0", Repetition::OPTIONAL, - parquet_cpp::Type::BYTE_ARRAY); - ASSERT_OK(NodeToField(node, &field)); - - std::shared_ptr ex_type = std::make_shared( - std::make_shared("", UINT8)); - - ASSERT_EQ(field->name, "field0"); - ASSERT_TRUE(field->type->Equals(ex_type)); - ASSERT_TRUE(field->nullable); - - node = PrimitiveNode::Make("field1", Repetition::OPTIONAL, - parquet_cpp::Type::BYTE_ARRAY, - parquet_cpp::LogicalType::UTF8); - ASSERT_OK(NodeToField(node, &field)); - ex_type = std::make_shared(); - - ASSERT_EQ(field->name, "field1"); - ASSERT_TRUE(field->type->Equals(ex_type)); - ASSERT_TRUE(field->nullable); -} - -TEST(TestNodeConversion, FixedLenByteArray) { -} - -TEST(TestNodeConversion, Logical) { -} - -TEST(TestSchemaConversion, Basics) { +TEST(TestNodeConversion, DateAndTime) { } } // namespace parquet From e5c429ab024ec8989fad1ad091188a4d93a502dd Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sat, 26 Mar 2016 16:48:23 -0700 Subject: [PATCH 6/7] Test for some unsupported Parquet schema types, add unannotated FIXED_LEN_BYTE_ARRAY to List --- cpp/src/arrow/parquet/parquet-schema-test.cc | 73 ++++++++++++-------- cpp/src/arrow/parquet/schema.cc | 48 ++++++++++--- cpp/src/arrow/util/status.h | 1 + 3 files changed, 83 insertions(+), 39 deletions(-) diff --git a/cpp/src/arrow/parquet/parquet-schema-test.cc b/cpp/src/arrow/parquet/parquet-schema-test.cc index b83242166a467..9c3093d9ff7c9 100644 --- a/cpp/src/arrow/parquet/parquet-schema-test.cc +++ b/cpp/src/arrow/parquet/parquet-schema-test.cc @@ -49,32 +49,31 @@ class TestConvertParquetSchema : public ::testing::Test { public: virtual void SetUp() {} - void CheckFlatSchema(const std::vector& nodes, - const std::shared_ptr& expected_schema) { - NodePtr schema = GroupNode::Make("schema", Repetition::REPEATED, nodes); - - parquet_cpp::SchemaDescriptor descr; - descr.Init(schema); - - std::shared_ptr result_schema; - ASSERT_OK(FromParquetSchema(&descr, &result_schema)); - - ASSERT_EQ(expected_schema->num_fields(), result_schema->num_fields()); - for (size_t i = 0; i < nodes.size(); ++i) { - auto lhs = result_schema->field(i); + void CheckFlatSchema(const std::shared_ptr& expected_schema) { + ASSERT_EQ(expected_schema->num_fields(), result_schema_->num_fields()); + for (int i = 0; i < expected_schema->num_fields(); ++i) { + auto lhs = result_schema_->field(i); auto rhs = expected_schema->field(i); EXPECT_TRUE(lhs->Equals(rhs)) << i << " " << lhs->ToString() << " != " << rhs->ToString(); } } + + Status ConvertSchema(const std::vector& nodes) { + NodePtr schema = GroupNode::Make("schema", Repetition::REPEATED, nodes); + descr_.Init(schema); + return FromParquetSchema(&descr_, &result_schema_); + } + + protected: + parquet_cpp::SchemaDescriptor descr_; + std::shared_ptr result_schema_; }; TEST_F(TestConvertParquetSchema, ParquetFlatPrimitives) { std::vector parquet_fields; std::vector> arrow_fields; - std::shared_ptr field; - parquet_fields.push_back( PrimitiveNode::Make("boolean", Repetition::REQUIRED, parquet_cpp::Type::BOOLEAN)); arrow_fields.push_back(std::make_shared("boolean", BOOL, false)); @@ -106,20 +105,38 @@ TEST_F(TestConvertParquetSchema, ParquetFlatPrimitives) { parquet_cpp::LogicalType::UTF8)); arrow_fields.push_back(std::make_shared("string", UTF8)); + parquet_fields.push_back( + PrimitiveNode::Make("flba-binary", Repetition::OPTIONAL, + parquet_cpp::Type::FIXED_LEN_BYTE_ARRAY, + parquet_cpp::LogicalType::NONE, 12)); + arrow_fields.push_back(std::make_shared("flba-binary", BINARY)); + auto arrow_schema = std::make_shared(arrow_fields); - CheckFlatSchema(parquet_fields, arrow_schema); - - // case parquet_cpp::Type::INT96: - // TODO: Implement! - // node = PrimitiveNode::Make("int96", Repetition::REQUIRED, parquet_cpp::Type::INT96); - // field = NodeToField(node); - // TODO: Assertions - - // TODO: Implement! - // node = PrimitiveNode::Make("fixed_len_byte_array", Repetition::REQUIRED, - // parquet_cpp::Type::FIXED_LEN_BYTE_ARRAY); - // field = NodeToField(node); - // TODO: Assertions + ASSERT_OK(ConvertSchema(parquet_fields)); + + CheckFlatSchema(arrow_schema); +} + +TEST_F(TestConvertParquetSchema, UnsupportedThings) { + std::vector unsupported_nodes; + + unsupported_nodes.push_back( + PrimitiveNode::Make("int96", Repetition::REQUIRED, parquet_cpp::Type::INT96)); + + unsupported_nodes.push_back( + GroupNode::Make("repeated-group", Repetition::REPEATED, {})); + + unsupported_nodes.push_back( + PrimitiveNode::Make("int32", Repetition::OPTIONAL, + parquet_cpp::Type::INT32, parquet_cpp::LogicalType::DATE)); + + unsupported_nodes.push_back( + PrimitiveNode::Make("int64", Repetition::OPTIONAL, + parquet_cpp::Type::INT64, parquet_cpp::LogicalType::TIMESTAMP_MILLIS)); + + for (const NodePtr& node : unsupported_nodes) { + ASSERT_RAISES(NotImplemented, ConvertSchema({node})); + } } TEST(TestNodeConversion, DateAndTime) { diff --git a/cpp/src/arrow/parquet/schema.cc b/cpp/src/arrow/parquet/schema.cc index 63f7e47bf73bc..6b1de572617b8 100644 --- a/cpp/src/arrow/parquet/schema.cc +++ b/cpp/src/arrow/parquet/schema.cc @@ -66,6 +66,9 @@ static Status FromByteArray(const PrimitiveNode* node, TypePtr* out) { static Status FromFLBA(const PrimitiveNode* node, TypePtr* out) { switch (node->logical_type()) { + case LogicalType::NONE: + *out = BINARY; + break; case LogicalType::DECIMAL: *out = MakeDecimalType(node); break; @@ -77,9 +80,37 @@ static Status FromFLBA(const PrimitiveNode* node, TypePtr* out) { return Status::OK(); } +static Status FromInt32(const PrimitiveNode* node, TypePtr* out) { + switch (node->logical_type()) { + case LogicalType::NONE: + *out = INT32; + break; + default: + return Status::NotImplemented("Unhandled logical type for int32"); + break; + } + return Status::OK(); +} + +static Status FromInt64(const PrimitiveNode* node, TypePtr* out) { + switch (node->logical_type()) { + case LogicalType::NONE: + *out = INT64; + break; + default: + return Status::NotImplemented("Unhandled logical type for int64"); + break; + } + return Status::OK(); +} + // TODO: Logical Type Handling Status NodeToField(const NodePtr& node, std::shared_ptr* out) { - TypePtr type; + std::shared_ptr type; + + if (node->is_repeated()) { + return Status::NotImplemented("No support yet for repeated node types"); + } if (node->is_group()) { const GroupNode* group = static_cast(node.get()); @@ -97,10 +128,10 @@ Status NodeToField(const NodePtr& node, std::shared_ptr* out) { type = BOOL; break; case parquet_cpp::Type::INT32: - type = INT32; + RETURN_NOT_OK(FromInt32(primitive, &type)); break; case parquet_cpp::Type::INT64: - type = INT64; + RETURN_NOT_OK(FromInt64(primitive, &type)); break; case parquet_cpp::Type::INT96: // TODO: Do we have that type in Arrow? @@ -122,23 +153,18 @@ Status NodeToField(const NodePtr& node, std::shared_ptr* out) { } } - if (node->is_repeated()) { - type = TypePtr(new ListType(type)); - } - *out = std::make_shared(node->name(), type, !node->is_required()); - return Status::OK(); } Status FromParquetSchema(const parquet_cpp::SchemaDescriptor* parquet_schema, std::shared_ptr* out) { - std::vector> fields; + // TODO(wesm): Consider adding an arrow::Schema name attribute, which comes + // from the root Parquet node const GroupNode* schema_node = static_cast( parquet_schema->schema().get()); - // TODO: What to with the head node? - fields.resize(schema_node->field_count()); + std::vector> fields(schema_node->field_count()); for (int i = 0; i < schema_node->field_count(); i++) { RETURN_NOT_OK(NodeToField(schema_node->field(i), &fields[i])); } diff --git a/cpp/src/arrow/util/status.h b/cpp/src/arrow/util/status.h index b5931232dbdcb..4e273edcb8f1f 100644 --- a/cpp/src/arrow/util/status.h +++ b/cpp/src/arrow/util/status.h @@ -109,6 +109,7 @@ class Status { bool IsKeyError() const { return code() == StatusCode::KeyError; } bool IsInvalid() const { return code() == StatusCode::Invalid; } bool IsIOError() const { return code() == StatusCode::IOError; } + bool IsNotImplemented() const { return code() == StatusCode::NotImplemented; } // Return a string representation of this status suitable for printing. // Returns the string "OK" for success. From f38821085b6c153354261a2794f9964edd37737b Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sat, 26 Mar 2016 16:50:00 -0700 Subject: [PATCH 7/7] Correct typo in Layout.md (thanks @takahirox) --- format/Layout.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/format/Layout.md b/format/Layout.md index 2d46ece606ea7..1b532c6b3817c 100644 --- a/format/Layout.md +++ b/format/Layout.md @@ -58,7 +58,7 @@ Base requirements * Memory layout and random access patterns for each relative type * Null value representation -## Non-goals (for this document +## Non-goals (for this document) * To enumerate or specify logical types that can be implemented as primitive (fixed-width) value types. For example: signed and unsigned integers,