diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 6d701079b482c..6ed2768d13918 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -378,6 +378,20 @@ function(ADD_ARROW_TEST_DEPENDENCIES REL_TEST_NAME)
   add_dependencies(${TEST_NAME} ${ARGN})
 endfunction()
 
+# A wrapper for target_link_libraries() that is compatible with NO_TESTS.
+#
+# REL_TEST_NAME is reduced to its file name without directory or extension to
+# obtain the test target; all remaining arguments are forwarded unchanged to
+# target_link_libraries(). Returns without doing anything when NO_TESTS is set.
+function(ARROW_TEST_LINK_LIBRARIES REL_TEST_NAME)
+  if(NO_TESTS)
+    return()
+  endif()
+  get_filename_component(TEST_NAME ${REL_TEST_NAME} NAME_WE)
+
+  target_link_libraries(${TEST_NAME} ${ARGN})
+endfunction()
+
 enable_testing()
 
 ############################################################
@@ -528,6 +542,7 @@ set(ARROW_SRCS
   src/arrow/ipc/metadata-internal.cc
 
   src/arrow/types/construct.cc
+  src/arrow/types/decimal.cc
   src/arrow/types/json.cc
   src/arrow/types/list.cc
   src/arrow/types/primitive.cc
diff --git a/cpp/src/arrow/parquet/CMakeLists.txt b/cpp/src/arrow/parquet/CMakeLists.txt
index 7b449affab025..f9479900bb135 100644
--- a/cpp/src/arrow/parquet/CMakeLists.txt
+++ b/cpp/src/arrow/parquet/CMakeLists.txt
@@ -19,9 +19,12 @@
 # arrow_parquet : Arrow <-> Parquet adapter
 
 set(PARQUET_SRCS
+  schema.cc
 )
 
 set(PARQUET_LIBS
+  arrow
+  ${PARQUET_SHARED_LIB}
 )
 
 add_library(arrow_parquet STATIC
@@ -30,6 +33,9 @@ add_library(arrow_parquet STATIC
 target_link_libraries(arrow_parquet ${PARQUET_LIBS})
 SET_TARGET_PROPERTIES(arrow_parquet PROPERTIES LINKER_LANGUAGE CXX)
 
+ADD_ARROW_TEST(parquet-schema-test)
+ARROW_TEST_LINK_LIBRARIES(parquet-schema-test arrow_parquet)
+
 # Headers: top level
 install(FILES
   DESTINATION include/arrow/parquet)
diff --git a/cpp/src/arrow/parquet/parquet-schema-test.cc b/cpp/src/arrow/parquet/parquet-schema-test.cc
new file mode 100644
index 0000000000000..e562df9c57704
--- /dev/null
+++ b/cpp/src/arrow/parquet/parquet-schema-test.cc
@@ -0,0 +1,95 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gtest/gtest.h"
+
+#include "arrow/parquet/schema.h"
+
+namespace arrow {
+
+namespace parquet {
+
+using parquet_cpp::Repetition;
+using parquet_cpp::schema::NodePtr;
+using parquet_cpp::schema::PrimitiveNode;
+
+// Each supported physical type on a REQUIRED primitive node must convert to a
+// non-nullable field carrying the node's name and the matching Arrow type.
+TEST(TestNodeConversion, Primitive) {
+  NodePtr node = PrimitiveNode::Make("boolean", Repetition::REQUIRED,
+      parquet_cpp::Type::BOOLEAN);
+  std::shared_ptr<Field> field = NodeToField(node);
+  ASSERT_EQ(field->name, "boolean");
+  ASSERT_TRUE(field->type->Equals(std::make_shared<BooleanType>()));
+  ASSERT_FALSE(field->nullable);
+
+  node = PrimitiveNode::Make("int32", Repetition::REQUIRED, parquet_cpp::Type::INT32);
+  field = NodeToField(node);
+  ASSERT_EQ(field->name, "int32");
+  ASSERT_TRUE(field->type->Equals(std::make_shared<Int32Type>()));
+  ASSERT_FALSE(field->nullable);
+
+  node = PrimitiveNode::Make("int64", Repetition::REQUIRED, parquet_cpp::Type::INT64);
+  field = NodeToField(node);
+  ASSERT_EQ(field->name, "int64");
+  ASSERT_TRUE(field->type->Equals(std::make_shared<Int64Type>()));
+  ASSERT_FALSE(field->nullable);
+
+  // case parquet_cpp::Type::INT96:
+  // TODO: Implement!
+  // node = PrimitiveNode::Make("int96", Repetition::REQUIRED, parquet_cpp::Type::INT96);
+  // field = NodeToField(node);
+  // TODO: Assertions
+
+  // case parquet_cpp::Type::FLOAT:
+  node = PrimitiveNode::Make("float", Repetition::REQUIRED, parquet_cpp::Type::FLOAT);
+  field = NodeToField(node);
+  ASSERT_EQ(field->name, "float");
+  ASSERT_TRUE(field->type->Equals(std::make_shared<FloatType>()));
+  ASSERT_FALSE(field->nullable);
+
+  // case parquet_cpp::Type::DOUBLE:
+  node = PrimitiveNode::Make("double", Repetition::REQUIRED, parquet_cpp::Type::DOUBLE);
+  field = NodeToField(node);
+  ASSERT_EQ(field->name, "double");
+  ASSERT_TRUE(field->type->Equals(std::make_shared<DoubleType>()));
+  ASSERT_FALSE(field->nullable);
+
+  // TODO: Implement!
+  // node = PrimitiveNode::Make("byte_array", Repetition::REQUIRED,
+  //     parquet_cpp::Type::BYTE_ARRAY);
+  // field = NodeToField(node);
+  // TODO: Assertions
+
+  // TODO: Implement!
+  // node = PrimitiveNode::Make("fixed_len_byte_array", Repetition::REQUIRED,
+  //     parquet_cpp::Type::FIXED_LEN_BYTE_ARRAY);
+  // field = NodeToField(node);
+  // TODO: Assertions
+}
+
+TEST(TestNodeConversion, Logical) {
+  // TODO: Cover logical type annotations once NodeToField handles them.
+}
+
+TEST(TestSchemaConversion, Basics) {
+  // TODO: Cover FromParquetSchema().
+}
+
+} // namespace parquet
+
+} // namespace arrow
diff --git a/cpp/src/arrow/parquet/schema.cc b/cpp/src/arrow/parquet/schema.cc
new file mode 100644
index 0000000000000..3ac169d2fe1dc
--- /dev/null
+++ b/cpp/src/arrow/parquet/schema.cc
@@ -0,0 +1,125 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <vector>
+
+#include "arrow/parquet/schema.h"
+#include "arrow/types/decimal.h"
+
+using parquet_cpp::schema::Node;
+using parquet_cpp::schema::NodePtr;
+using parquet_cpp::schema::GroupNode;
+using parquet_cpp::schema::PrimitiveNode;
+
+namespace arrow {
+
+namespace parquet {
+
+// Builds the Arrow DecimalType for a Parquet primitive node from the precision
+// and scale recorded in the node's decimal metadata.
+TypePtr MakeDecimalType(const PrimitiveNode* node) {
+  const int precision = node->decimal_metadata().precision;
+  const int scale = node->decimal_metadata().scale;
+  return std::make_shared<DecimalType>(precision, scale);
+}
+
+// Converts a Parquet schema node into an Arrow field of the same name.
+//
+// A group node becomes a StructType whose children are converted recursively; a
+// primitive node is mapped from its physical type. The type of a repeated node
+// is wrapped in a ListType, and the field is nullable unless the node is
+// required.
+//
+// NOTE: INT96, BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY without the DECIMAL logical
+// type have no mapping yet; `type` is left null for such nodes.
+// TODO: Logical Type Handling
+std::shared_ptr<Field> NodeToField(const NodePtr& node) {
+  TypePtr type;
+
+  if (node->is_group()) {
+    const GroupNode* group = static_cast<const GroupNode*>(node.get());
+    std::vector<std::shared_ptr<Field>> fields;
+    for (int i = 0; i < group->field_count(); i++) {
+      fields.push_back(NodeToField(group->field(i)));
+    }
+    type = std::make_shared<StructType>(fields);
+  } else {
+    // Primitive (leaf) node
+    const PrimitiveNode* primitive = static_cast<const PrimitiveNode*>(node.get());
+
+    switch (primitive->physical_type()) {
+      case parquet_cpp::Type::BOOLEAN:
+        type = std::make_shared<BooleanType>();
+        break;
+      case parquet_cpp::Type::INT32:
+        type = std::make_shared<Int32Type>();
+        break;
+      case parquet_cpp::Type::INT64:
+        type = std::make_shared<Int64Type>();
+        break;
+      case parquet_cpp::Type::INT96:
+        // TODO: Do we have that type in Arrow?
+        // type = std::make_shared<Int96Type>();
+        break;
+      case parquet_cpp::Type::FLOAT:
+        type = std::make_shared<FloatType>();
+        break;
+      case parquet_cpp::Type::DOUBLE:
+        type = std::make_shared<DoubleType>();
+        break;
+      case parquet_cpp::Type::BYTE_ARRAY:
+        // TODO: Do we have that type in Arrow?
+        break;
+      case parquet_cpp::Type::FIXED_LEN_BYTE_ARRAY:
+        switch (primitive->logical_type()) {
+          case parquet_cpp::LogicalType::DECIMAL:
+            type = MakeDecimalType(primitive);
+            break;
+          default:
+            // TODO: Do we have that type in Arrow?
+            break;
+        }
+        break;
+    }
+  }
+
+  if (node->is_repeated()) {
+    type = std::make_shared<ListType>(type);
+  }
+
+  return std::make_shared<Field>(node->name(), type, !node->is_required());
+}
+
+// Converts a Parquet schema descriptor into an Arrow schema by converting every
+// field of the group node returned by parquet_schema->schema() with NodeToField().
+std::shared_ptr<Schema> FromParquetSchema(
+    const parquet_cpp::SchemaDescriptor* parquet_schema) {
+  std::vector<std::shared_ptr<Field>> fields;
+  const GroupNode* schema_node = static_cast<const GroupNode*>(
+      parquet_schema->schema().get());
+
+  // TODO: What to do with the head node?
+  for (int i = 0; i < schema_node->field_count(); i++) {
+    fields.push_back(NodeToField(schema_node->field(i)));
+  }
+
+  return std::make_shared<Schema>(fields);
+}
+
+} // namespace parquet
+
+} // namespace arrow
diff --git a/cpp/src/arrow/parquet/schema.h b/cpp/src/arrow/parquet/schema.h
new file mode 100644
index 0000000000000..0071681656b8c
--- /dev/null
+++ b/cpp/src/arrow/parquet/schema.h
@@ -0,0 +1,46 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef ARROW_PARQUET_SCHEMA_H
+#define ARROW_PARQUET_SCHEMA_H
+
+// NOTE(review): the targets of the angle-bracket includes below were lost when
+// this patch was extracted; they are reconstructed from the names used here and
+// in schema.cc. Confirm the exact paths before applying.
+#include <arrow/schema.h>
+#include <arrow/type.h>
+#include <arrow/types/list.h>
+#include <arrow/types/struct.h>
+
+#include <parquet/api/schema.h>
+
+namespace arrow {
+
+namespace parquet {
+
+// Converts a single Parquet schema node into the corresponding Arrow field.
+std::shared_ptr<Field> NodeToField(const parquet_cpp::schema::NodePtr& node);
+
+// Converts a complete Parquet schema descriptor into an Arrow schema.
+std::shared_ptr<Schema> FromParquetSchema(
+    const parquet_cpp::SchemaDescriptor* parquet_schema);
+
+} // namespace parquet
+
+} // namespace arrow
+
+#endif  // ARROW_PARQUET_SCHEMA_H
diff --git a/cpp/src/arrow/types/decimal.cc b/cpp/src/arrow/types/decimal.cc
new file mode 100644
index 0000000000000..f120c1a9dfde6
--- /dev/null
+++ b/cpp/src/arrow/types/decimal.cc
@@ -0,0 +1,32 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/types/decimal.h"
+
+#include <sstream>
+#include <string>
+
+namespace arrow {
+
+// Renders the type as "decimal(<precision>, <scale>)".
+std::string DecimalType::ToString() const {
+  std::stringstream s;
+  s << "decimal(" << precision << ", " << scale << ")";
+  return s.str();
+}
+
+} // namespace arrow
diff --git a/cpp/src/arrow/types/decimal.h b/cpp/src/arrow/types/decimal.h
index 464c3ff8da92b..26243b42b0e7d 100644
--- a/cpp/src/arrow/types/decimal.h
+++ b/cpp/src/arrow/types/decimal.h
@@ -18,13 +18,23 @@
 #ifndef ARROW_TYPES_DECIMAL_H
 #define ARROW_TYPES_DECIMAL_H
 
+#include <string>
+
 #include "arrow/type.h"
 
 namespace arrow {
 
 struct DecimalType : public DataType {
+  // Creates a decimal type with the given precision and scale.
+  explicit DecimalType(int precision_, int scale_)
+      : DataType(Type::DECIMAL), precision(precision_), scale(scale_) {}
   int precision;
   int scale;
+
+  static char const* name() { return "decimal"; }
+
+  // Renders the type as "decimal(<precision>, <scale>)".
+  std::string ToString() const override;
 };
 
 } // namespace arrow