diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 8042661533e1d..e8cb88c0b4d9b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -37,18 +37,17 @@ if ("$ENV{CMAKE_EXPORT_COMPILE_COMMANDS}" STREQUAL "1") set(CMAKE_EXPORT_COMPILE_COMMANDS 1) endif() -# Enable using a custom GCC toolchain to build Arrow -if (NOT "$ENV{ARROW_GCC_ROOT}" STREQUAL "") - set(GCC_ROOT $ENV{ARROW_GCC_ROOT}) - set(CMAKE_C_COMPILER ${GCC_ROOT}/bin/gcc) - set(CMAKE_CXX_COMPILER ${GCC_ROOT}/bin/g++) -endif() - if(APPLE) # In newer versions of CMake, this is the default setting set(CMAKE_MACOSX_RPATH 1) endif() +find_program(CCACHE_FOUND ccache) +if(CCACHE_FOUND) + set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache) + set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ccache) +endif(CCACHE_FOUND) + # ---------------------------------------------------------------------- # cmake options @@ -126,38 +125,16 @@ endif () # Add common flags set(CMAKE_CXX_FLAGS "${CXX_COMMON_FLAGS} ${CMAKE_CXX_FLAGS}") -# Required to avoid static linking errors with dependencies -add_definitions(-fPIC) - # Determine compiler version include(CompilerInfo) if ("${COMPILER_FAMILY}" STREQUAL "clang") - # Clang helpfully provides a few extensions from C++11 such as the 'override' - # keyword on methods. This doesn't change behavior, and we selectively enable - # it in src/gutil/port.h only on clang. So, we can safely use it, and don't want - # to trigger warnings when we do so. - # set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-c++11-extensions") - # Using Clang with ccache causes a bunch of spurious warnings that are # purportedly fixed in the next version of ccache. See the following for details: # # http://petereisentraut.blogspot.com/2011/05/ccache-and-clang.html # http://petereisentraut.blogspot.com/2011/09/ccache-and-clang-part-2.html set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Qunused-arguments") - - # Only hardcode -fcolor-diagnostics if stderr is opened on a terminal. 
Otherwise - # the color codes show up as noisy artifacts. - # - # This test is imperfect because 'cmake' and 'make' can be run independently - # (with different terminal options), and we're testing during the former. - execute_process(COMMAND test -t 2 RESULT_VARIABLE ARROW_IS_TTY) - if ((${ARROW_IS_TTY} EQUAL 0) AND (NOT ("$ENV{TERM}" STREQUAL "dumb"))) - message("Running in a controlling terminal") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fcolor-diagnostics") - else() - message("Running without a controlling terminal or in a dumb terminal") - endif() endif() # Sanity check linking option. @@ -278,12 +255,6 @@ set(LIBRARY_OUTPUT_DIRECTORY "${BUILD_OUTPUT_ROOT_DIRECTORY}") set(EXECUTABLE_OUTPUT_PATH "${BUILD_OUTPUT_ROOT_DIRECTORY}") include_directories(src) -############################################################ -# Visibility -############################################################ -# For generate_export_header() and add_compiler_export_flags(). -include(GenerateExportHeader) - ############################################################ # Testing ############################################################ @@ -456,21 +427,32 @@ endif() # Subdirectories ############################################################ -add_subdirectory(src/arrow) -add_subdirectory(src/arrow/util) -add_subdirectory(src/arrow/table) -add_subdirectory(src/arrow/types) - -set(LINK_LIBS - arrow_util - arrow_table - arrow_types) +set(LIBARROW_LINK_LIBS +) set(ARROW_SRCS src/arrow/array.cc src/arrow/builder.cc - src/arrow/field.cc src/arrow/type.cc + + src/arrow/table/column.cc + src/arrow/table/schema.cc + src/arrow/table/table.cc + + src/arrow/types/construct.cc + src/arrow/types/floating.cc + src/arrow/types/integer.cc + src/arrow/types/json.cc + src/arrow/types/list.cc + src/arrow/types/primitive.cc + src/arrow/types/string.cc + src/arrow/types/struct.cc + src/arrow/types/union.cc + + src/arrow/util/bit-util.cc + src/arrow/util/buffer.cc + src/arrow/util/memory-pool.cc + 
src/arrow/util/status.cc ) set(LIBARROW_LINKAGE "SHARED") @@ -479,8 +461,15 @@ add_library(arrow ${LIBARROW_LINKAGE} ${ARROW_SRCS} ) -target_link_libraries(arrow ${LINK_LIBS}) -set_target_properties(arrow PROPERTIES LINKER_LANGUAGE CXX) +set_target_properties(arrow + PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "${BUILD_OUTPUT_ROOT_DIRECTORY}") +target_link_libraries(arrow ${LIBARROW_LINK_LIBS}) + +add_subdirectory(src/arrow) +add_subdirectory(src/arrow/util) +add_subdirectory(src/arrow/table) +add_subdirectory(src/arrow/types) install(TARGETS arrow LIBRARY DESTINATION lib diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 102a8a1853f3e..77326ce38d754 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -20,7 +20,6 @@ install(FILES api.h array.h builder.h - field.h type.h DESTINATION include/arrow) diff --git a/cpp/src/arrow/api.h b/cpp/src/arrow/api.h index 899e8aae19c0e..c73d4b386cf54 100644 --- a/cpp/src/arrow/api.h +++ b/cpp/src/arrow/api.h @@ -15,7 +15,28 @@ // specific language governing permissions and limitations // under the License. 
+// Coarse public API while the library is in development + #ifndef ARROW_API_H #define ARROW_API_H +#include "arrow/array.h" +#include "arrow/builder.h" +#include "arrow/type.h" + +#include "arrow/table/column.h" +#include "arrow/table/schema.h" +#include "arrow/table/table.h" + +#include "arrow/types/boolean.h" +#include "arrow/types/construct.h" +#include "arrow/types/floating.h" +#include "arrow/types/integer.h" +#include "arrow/types/list.h" +#include "arrow/types/string.h" +#include "arrow/types/struct.h" + +#include "arrow/util/memory-pool.h" +#include "arrow/util/status.h" + #endif // ARROW_API_H diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 491b9133d2cca..8cc689c3e81ee 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -32,7 +32,7 @@ class Array; class MemoryPool; class PoolBuffer; -static constexpr int32_t MIN_BUILDER_CAPACITY = 1 << 8; +static constexpr int32_t MIN_BUILDER_CAPACITY = 1 << 5; // Base class for all data array builders class ArrayBuilder { @@ -78,12 +78,16 @@ class ArrayBuilder { // Creates new array object to hold the contents of the builder and transfers // ownership of the data - virtual Status ToArray(Array** out) = 0; + virtual std::shared_ptr Finish() = 0; + + const std::shared_ptr& type() const { + return type_; + } protected: MemoryPool* pool_; - TypePtr type_; + std::shared_ptr type_; // When nulls are first appended to the builder, the null bitmap is allocated std::shared_ptr nulls_; diff --git a/cpp/src/arrow/field.h b/cpp/src/arrow/field.h deleted file mode 100644 index 89a450c66f256..0000000000000 --- a/cpp/src/arrow/field.h +++ /dev/null @@ -1,63 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_FIELD_H -#define ARROW_FIELD_H - -#include - -#include "arrow/type.h" - -namespace arrow { - -// A field is a piece of metadata that includes (for now) a name and a data -// type - -struct Field { - // Field name - std::string name; - - // The field's data type - TypePtr type; - - Field(const std::string& name, const TypePtr& type) : - name(name), - type(type) {} - - bool operator==(const Field& other) const { - return this->Equals(other); - } - - bool operator!=(const Field& other) const { - return !this->Equals(other); - } - - bool Equals(const Field& other) const { - return (this == &other) || (this->name == other.name && - this->type->Equals(other.type.get())); - } - - bool nullable() const { - return this->type->nullable; - } - - std::string ToString() const; -}; - -} // namespace arrow - -#endif // ARROW_FIELD_H diff --git a/cpp/src/arrow/table/CMakeLists.txt b/cpp/src/arrow/table/CMakeLists.txt index 68bf3148a9889..26d843d853bfb 100644 --- a/cpp/src/arrow/table/CMakeLists.txt +++ b/cpp/src/arrow/table/CMakeLists.txt @@ -19,21 +19,6 @@ # arrow_table ####################################### -set(TABLE_SRCS - column.cc - schema.cc - table.cc -) - -set(TABLE_LIBS -) - -add_library(arrow_table STATIC - ${TABLE_SRCS} -) -target_link_libraries(arrow_table ${TABLE_LIBS}) -SET_TARGET_PROPERTIES(arrow_table PROPERTIES LINKER_LANGUAGE CXX) - # Headers: top level 
install(FILES column.h diff --git a/cpp/src/arrow/table/column-test.cc b/cpp/src/arrow/table/column-test.cc index 4959b82c6e2ae..bf95932916cf4 100644 --- a/cpp/src/arrow/table/column-test.cc +++ b/cpp/src/arrow/table/column-test.cc @@ -21,7 +21,6 @@ #include #include -#include "arrow/field.h" #include "arrow/table/column.h" #include "arrow/table/schema.h" #include "arrow/table/test-common.h" diff --git a/cpp/src/arrow/table/column.cc b/cpp/src/arrow/table/column.cc index d68b491fb99da..573e650875944 100644 --- a/cpp/src/arrow/table/column.cc +++ b/cpp/src/arrow/table/column.cc @@ -20,7 +20,7 @@ #include #include -#include "arrow/field.h" +#include "arrow/type.h" #include "arrow/util/status.h" namespace arrow { diff --git a/cpp/src/arrow/table/column.h b/cpp/src/arrow/table/column.h index 64423bf956147..dfc7516e26aac 100644 --- a/cpp/src/arrow/table/column.h +++ b/cpp/src/arrow/table/column.h @@ -23,7 +23,7 @@ #include #include "arrow/array.h" -#include "arrow/field.h" +#include "arrow/type.h" namespace arrow { diff --git a/cpp/src/arrow/table/schema-test.cc b/cpp/src/arrow/table/schema-test.cc index 0cf1b3c5f9a8e..d6725cc08c0c8 100644 --- a/cpp/src/arrow/table/schema-test.cc +++ b/cpp/src/arrow/table/schema-test.cc @@ -20,7 +20,6 @@ #include #include -#include "arrow/field.h" #include "arrow/table/schema.h" #include "arrow/type.h" #include "arrow/types/string.h" @@ -97,10 +96,10 @@ TEST_F(TestSchema, ToString) { auto schema = std::make_shared(fields); std::string result = schema->ToString(); - std::string expected = R"(f0 ?int32 -f1 uint8 -f2 ?string -f3 ?list + std::string expected = R"(f0 int32 +f1 uint8 not null +f2 string +f3 list )"; ASSERT_EQ(expected, result); diff --git a/cpp/src/arrow/table/schema.cc b/cpp/src/arrow/table/schema.cc index fb3b4d6f29268..d49d0a713e7f4 100644 --- a/cpp/src/arrow/table/schema.cc +++ b/cpp/src/arrow/table/schema.cc @@ -22,7 +22,7 @@ #include #include -#include "arrow/field.h" +#include "arrow/type.h" namespace arrow { diff 
--git a/cpp/src/arrow/table/schema.h b/cpp/src/arrow/table/schema.h index d04e3f628c1e3..103f01b26e3ca 100644 --- a/cpp/src/arrow/table/schema.h +++ b/cpp/src/arrow/table/schema.h @@ -22,7 +22,6 @@ #include #include -#include "arrow/field.h" #include "arrow/type.h" namespace arrow { diff --git a/cpp/src/arrow/table/table-test.cc b/cpp/src/arrow/table/table-test.cc index dd4f74cd16f89..c4fdb062db83a 100644 --- a/cpp/src/arrow/table/table-test.cc +++ b/cpp/src/arrow/table/table-test.cc @@ -21,7 +21,6 @@ #include #include -#include "arrow/field.h" #include "arrow/table/column.h" #include "arrow/table/schema.h" #include "arrow/table/table.h" diff --git a/cpp/src/arrow/table/table.cc b/cpp/src/arrow/table/table.cc index 4cefc924ed38f..0c788b8fe3ff3 100644 --- a/cpp/src/arrow/table/table.cc +++ b/cpp/src/arrow/table/table.cc @@ -20,9 +20,9 @@ #include #include -#include "arrow/field.h" #include "arrow/table/column.h" #include "arrow/table/schema.h" +#include "arrow/type.h" #include "arrow/util/status.h" namespace arrow { diff --git a/cpp/src/arrow/table/test-common.h b/cpp/src/arrow/table/test-common.h index efe2f228cd0a3..50a5f6a2f5018 100644 --- a/cpp/src/arrow/table/test-common.h +++ b/cpp/src/arrow/table/test-common.h @@ -21,7 +21,6 @@ #include #include -#include "arrow/field.h" #include "arrow/table/column.h" #include "arrow/table/schema.h" #include "arrow/table/table.h" diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index ff145e2c1e3b4..265770822ce90 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -17,8 +17,56 @@ #include "arrow/type.h" +#include +#include + namespace arrow { +std::string Field::ToString() const { + std::stringstream ss; + ss << this->name << " " << this->type->ToString(); + return ss.str(); +} + +DataType::~DataType() {} + +StringType::StringType(bool nullable) + : DataType(LogicalType::STRING, nullable) {} + +StringType::StringType(const StringType& other) + : StringType(other.nullable) {} + +std::string 
StringType::ToString() const { + std::string result(name()); + if (!nullable) { + result.append(" not null"); + } + return result; +} + +std::string ListType::ToString() const { + std::stringstream s; + s << "list<" << value_type->ToString() << ">"; + if (!this->nullable) { + s << " not null"; + } + return s.str(); +} + +std::string StructType::ToString() const { + std::stringstream s; + s << "struct<"; + for (size_t i = 0; i < fields_.size(); ++i) { + if (i > 0) s << ", "; + const std::shared_ptr& field = fields_[i]; + s << field->name << ": " << field->type->ToString(); + } + s << ">"; + if (!nullable) s << " not null"; + return s.str(); +} + +const std::shared_ptr NA = std::make_shared(); const std::shared_ptr BOOL = std::make_shared(); const std::shared_ptr UINT8 = std::make_shared(); const std::shared_ptr UINT16 = std::make_shared(); @@ -30,5 +78,6 @@ const std::shared_ptr INT32 = std::make_shared(); const std::shared_ptr INT64 = std::make_shared(); const std::shared_ptr FLOAT = std::make_shared(); const std::shared_ptr DOUBLE = std::make_shared(); +const std::shared_ptr STRING = std::make_shared(); } // namespace arrow diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 4193a0e8bc851..e78e49491193e 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -20,6 +20,7 @@ #include #include +#include namespace arrow { @@ -71,49 +72,46 @@ struct LogicalType { UINT64 = 7, INT64 = 8, - // A boolean value represented as 1 byte - BOOL = 9, - // A boolean value represented as 1 bit - BIT = 10, + BOOL = 9, // 4-byte floating point value - FLOAT = 11, + FLOAT = 10, // 8-byte floating point value - DOUBLE = 12, + DOUBLE = 11, // CHAR(N): fixed-length UTF8 string with length N - CHAR = 13, + CHAR = 12, // UTF8 variable-length string as List - STRING = 14, + STRING = 13, // VARCHAR(N): Null-terminated string type embedded in a CHAR(N + 1) - VARCHAR = 15, + VARCHAR = 14, // Variable-length bytes (no guarantee of UTF8-ness) - BINARY = 16, + BINARY = 15, // 
By default, int32 days since the UNIX epoch - DATE = 17, + DATE = 16, // Exact timestamp encoded with int64 since UNIX epoch // Default unit millisecond - TIMESTAMP = 18, + TIMESTAMP = 17, // Timestamp as double seconds since the UNIX epoch - TIMESTAMP_DOUBLE = 19, + TIMESTAMP_DOUBLE = 18, // Exact time encoded with int64, default unit millisecond - TIME = 20, + TIME = 19, // Precision- and scale-based decimal type. Storage type depends on the // parameters. - DECIMAL = 21, + DECIMAL = 20, // Decimal value encoded as a text string - DECIMAL_TEXT = 22, + DECIMAL_TEXT = 21, // A list of some logical data type LIST = 30, @@ -141,7 +139,9 @@ struct DataType { type(type), nullable(nullable) {} - virtual bool Equals(const DataType* other) { + virtual ~DataType(); + + bool Equals(const DataType* other) { // Call with a pointer so more friendly to subclasses return this == other || (this->type == other->type && this->nullable == other->nullable); @@ -154,10 +154,45 @@ struct DataType { virtual std::string ToString() const = 0; }; - typedef std::shared_ptr LayoutPtr; typedef std::shared_ptr TypePtr; +// A field is a piece of metadata that includes (for now) a name and a data +// type +struct Field { + // Field name + std::string name; + + // The field's data type + TypePtr type; + + Field(const std::string& name, const TypePtr& type) : + name(name), + type(type) {} + + bool operator==(const Field& other) const { + return this->Equals(other); + } + + bool operator!=(const Field& other) const { + return !this->Equals(other); + } + + bool Equals(const Field& other) const { + return (this == &other) || (this->name == other.name && + this->type->Equals(other.type.get())); + } + + bool Equals(const std::shared_ptr& other) const { + return Equals(*other.get()); + } + + bool nullable() const { + return this->type->nullable; + } + + std::string ToString() const; +}; struct BytesType : public LayoutType { int size; @@ -183,16 +218,18 @@ struct PrimitiveType : public DataType { 
explicit PrimitiveType(bool nullable = true) : DataType(Derived::type_enum, nullable) {} - virtual std::string ToString() const { - std::string result; - if (nullable) { - result.append("?"); - } - result.append(static_cast(this)->name()); - return result; - } + std::string ToString() const override; }; +template +inline std::string PrimitiveType::ToString() const { + std::string result(static_cast(this)->name()); + if (!nullable) { + result.append(" not null"); + } + return result; +} + #define PRIMITIVE_DECL(TYPENAME, C_TYPE, ENUM, SIZE, NAME) \ typedef C_TYPE c_type; \ static constexpr LogicalType::type type_enum = LogicalType::ENUM; \ @@ -205,6 +242,10 @@ struct PrimitiveType : public DataType { return NAME; \ } +struct NullType : public PrimitiveType { + PRIMITIVE_DECL(NullType, void, NA, 0, "null"); +}; + struct BooleanType : public PrimitiveType { PRIMITIVE_DECL(BooleanType, uint8_t, BOOL, 1, "bool"); }; @@ -249,6 +290,55 @@ struct DoubleType : public PrimitiveType { PRIMITIVE_DECL(DoubleType, double, DOUBLE, 8, "double"); }; +struct ListType : public DataType { + // List can contain any other logical value type + TypePtr value_type; + + explicit ListType(const TypePtr& value_type, bool nullable = true) + : DataType(LogicalType::LIST, nullable), + value_type(value_type) {} + + static char const *name() { + return "list"; + } + + std::string ToString() const override; +}; + +// String is a logical type consisting of a physical list of 1-byte values +struct StringType : public DataType { + explicit StringType(bool nullable = true); + + StringType(const StringType& other); + + static char const *name() { + return "string"; + } + + std::string ToString() const override; +}; + +struct StructType : public DataType { + std::vector > fields_; + + explicit StructType(const std::vector >& fields, + bool nullable = true) + : DataType(LogicalType::STRUCT, nullable) { + fields_ = fields; + } + + const std::shared_ptr& field(int i) const { + return fields_[i]; + } + + int 
num_children() const { + return fields_.size(); + } + + std::string ToString() const override; +}; + +extern const std::shared_ptr NA; extern const std::shared_ptr BOOL; extern const std::shared_ptr UINT8; extern const std::shared_ptr UINT16; @@ -260,6 +350,7 @@ extern const std::shared_ptr INT32; extern const std::shared_ptr INT64; extern const std::shared_ptr FLOAT; extern const std::shared_ptr DOUBLE; +extern const std::shared_ptr STRING; } // namespace arrow diff --git a/cpp/src/arrow/types/CMakeLists.txt b/cpp/src/arrow/types/CMakeLists.txt index e090aead1f8b9..57cabdefd2525 100644 --- a/cpp/src/arrow/types/CMakeLists.txt +++ b/cpp/src/arrow/types/CMakeLists.txt @@ -19,31 +19,11 @@ # arrow_types ####################################### -set(TYPES_SRCS - construct.cc - floating.cc - integer.cc - json.cc - list.cc - primitive.cc - string.cc - struct.cc - union.cc -) - -set(TYPES_LIBS -) - -add_library(arrow_types STATIC - ${TYPES_SRCS} -) -target_link_libraries(arrow_types ${TYPES_LIBS}) -SET_TARGET_PROPERTIES(arrow_types PROPERTIES LINKER_LANGUAGE CXX) - # Headers: top level install(FILES boolean.h collection.h + construct.h datetime.h decimal.h floating.h diff --git a/cpp/src/arrow/types/boolean.h b/cpp/src/arrow/types/boolean.h index 8fc9cfd19c0d4..a5023d7b368d2 100644 --- a/cpp/src/arrow/types/boolean.h +++ b/cpp/src/arrow/types/boolean.h @@ -24,7 +24,8 @@ namespace arrow { typedef PrimitiveArrayImpl BooleanArray; -// typedef PrimitiveBuilder BooleanBuilder; +class BooleanBuilder : public ArrayBuilder { +}; } // namespace arrow diff --git a/cpp/src/arrow/types/construct.cc b/cpp/src/arrow/types/construct.cc index 05d6b270fc3fd..43f01a3051385 100644 --- a/cpp/src/arrow/types/construct.cc +++ b/cpp/src/arrow/types/construct.cc @@ -32,13 +32,13 @@ class ArrayBuilder; // Initially looked at doing this with vtables, but shared pointers makes it // difficult -#define BUILDER_CASE(ENUM, BuilderType) \ - case LogicalType::ENUM: \ - *out = static_cast(new 
BuilderType(pool, type)); \ +#define BUILDER_CASE(ENUM, BuilderType) \ + case LogicalType::ENUM: \ + out->reset(new BuilderType(pool, type)); \ return Status::OK(); -Status make_builder(MemoryPool* pool, const TypePtr& type, - ArrayBuilder** out) { +Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, + std::shared_ptr* out) { switch (type->type) { BUILDER_CASE(UINT8, UInt8Builder); BUILDER_CASE(INT8, Int8Builder); @@ -58,13 +58,12 @@ Status make_builder(MemoryPool* pool, const TypePtr& type, case LogicalType::LIST: { - ListType* list_type = static_cast(type.get()); - ArrayBuilder* value_builder; - RETURN_NOT_OK(make_builder(pool, list_type->value_type, &value_builder)); + std::shared_ptr value_builder; - // The ListBuilder takes ownership of the value_builder - ListBuilder* builder = new ListBuilder(pool, type, value_builder); - *out = static_cast(builder); + const std::shared_ptr& value_type = static_cast( + type.get())->value_type; + RETURN_NOT_OK(MakeBuilder(pool, value_type, &value_builder)); + out->reset(new ListBuilder(pool, type, value_builder)); return Status::OK(); } // BUILDER_CASE(CHAR, CharBuilder); diff --git a/cpp/src/arrow/types/construct.h b/cpp/src/arrow/types/construct.h index b5ba436f787d9..59ebe1acddc98 100644 --- a/cpp/src/arrow/types/construct.h +++ b/cpp/src/arrow/types/construct.h @@ -18,6 +18,8 @@ #ifndef ARROW_TYPES_CONSTRUCT_H #define ARROW_TYPES_CONSTRUCT_H +#include + #include "arrow/type.h" namespace arrow { @@ -26,8 +28,8 @@ class ArrayBuilder; class MemoryPool; class Status; -Status make_builder(MemoryPool* pool, const TypePtr& type, - ArrayBuilder** out); +Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, + std::shared_ptr* out); } // namespace arrow diff --git a/cpp/src/arrow/types/json.cc b/cpp/src/arrow/types/json.cc index b29b95715fef6..168e370d51a14 100644 --- a/cpp/src/arrow/types/json.cc +++ b/cpp/src/arrow/types/json.cc @@ -19,10 +19,7 @@ #include -#include "arrow/types/boolean.h" -#include 
"arrow/types/integer.h" -#include "arrow/types/floating.h" -#include "arrow/types/null.h" +#include "arrow/type.h" #include "arrow/types/string.h" #include "arrow/types/union.h" diff --git a/cpp/src/arrow/types/list-test.cc b/cpp/src/arrow/types/list-test.cc index b4bbd2841a89d..02991de2648e7 100644 --- a/cpp/src/arrow/types/list-test.cc +++ b/cpp/src/arrow/types/list-test.cc @@ -32,6 +32,7 @@ #include "arrow/types/test-common.h" #include "arrow/util/status.h" +using std::shared_ptr; using std::string; using std::unique_ptr; using std::vector; @@ -47,17 +48,18 @@ TEST(TypesTest, TestListType) { ASSERT_EQ(list_type.type, LogicalType::LIST); ASSERT_EQ(list_type.name(), string("list")); - ASSERT_EQ(list_type.ToString(), string("?list")); + ASSERT_EQ(list_type.ToString(), string("list")); ASSERT_EQ(list_type.value_type->type, vt->type); ASSERT_EQ(list_type.value_type->type, vt->type); std::shared_ptr st = std::make_shared(false); std::shared_ptr lt = std::make_shared(st, false); - ASSERT_EQ(lt->ToString(), string("list")); + ASSERT_EQ(lt->ToString(), string("list not null")); ListType lt2(lt, false); - ASSERT_EQ(lt2.ToString(), string("list>")); + ASSERT_EQ(lt2.ToString(), + string("list not null> not null")); } // ---------------------------------------------------------------------- @@ -71,23 +73,21 @@ class TestListBuilder : public TestBuilder { value_type_ = TypePtr(new Int32Type()); type_ = TypePtr(new ListType(value_type_)); - ArrayBuilder* tmp; - ASSERT_OK(make_builder(pool_, type_, &tmp)); - builder_.reset(static_cast(tmp)); + std::shared_ptr tmp; + ASSERT_OK(MakeBuilder(pool_, type_, &tmp)); + builder_ = std::dynamic_pointer_cast(tmp); } void Done() { - Array* out; - ASSERT_OK(builder_->ToArray(&out)); - result_.reset(static_cast(out)); + result_ = std::dynamic_pointer_cast(builder_->Finish()); } protected: TypePtr value_type_; TypePtr type_; - unique_ptr builder_; - unique_ptr result_; + shared_ptr builder_; + shared_ptr result_; }; @@ -116,7 +116,7 @@ 
TEST_F(TestListBuilder, TestBasics) { vector lengths = {3, 0, 4}; vector is_null = {0, 1, 0}; - Int32Builder* vb = static_cast(builder_->value_builder()); + Int32Builder* vb = static_cast(builder_->value_builder().get()); int pos = 0; for (size_t i = 0; i < lengths.size(); ++i) { diff --git a/cpp/src/arrow/types/list.cc b/cpp/src/arrow/types/list.cc index 577d71d0b2892..69a79a77fabe0 100644 --- a/cpp/src/arrow/types/list.cc +++ b/cpp/src/arrow/types/list.cc @@ -17,18 +17,6 @@ #include "arrow/types/list.h" -#include -#include - namespace arrow { -std::string ListType::ToString() const { - std::stringstream s; - if (this->nullable) { - s << "?"; - } - s << "list<" << value_type->ToString() << ">"; - return s.str(); -} - } // namespace arrow diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h index f39fe5c4d811b..f40a8245362b1 100644 --- a/cpp/src/arrow/types/list.h +++ b/cpp/src/arrow/types/list.h @@ -36,21 +36,6 @@ namespace arrow { class MemoryPool; -struct ListType : public DataType { - // List can contain any other logical value type - TypePtr value_type; - - explicit ListType(const TypePtr& value_type, bool nullable = true) - : DataType(LogicalType::LIST, nullable), - value_type(value_type) {} - - static char const *name() { - return "list"; - } - - virtual std::string ToString() const; -}; - class ListArray : public Array { public: ListArray() : Array(), offset_buf_(nullptr), offsets_(nullptr) {} @@ -106,10 +91,9 @@ class ListArray : public Array { class ListBuilder : public Int32Builder { public: ListBuilder(MemoryPool* pool, const TypePtr& type, - ArrayBuilder* value_builder) - : Int32Builder(pool, type) { - value_builder_.reset(value_builder); - } + std::shared_ptr value_builder) + : Int32Builder(pool, type), + value_builder_(value_builder) {} Status Init(int32_t elements) { // One more than requested. 
@@ -147,30 +131,27 @@ class ListBuilder : public Int32Builder { return Status::OK(); } - // Initialize an array type instance with the results of this builder - // Transfers ownership of all buffers template - Status Transfer(Container* out) { - Array* child_values; - RETURN_NOT_OK(value_builder_->ToArray(&child_values)); + std::shared_ptr Transfer() { + auto result = std::make_shared(); + + std::shared_ptr items = value_builder_->Finish(); // Add final offset if the length is non-zero if (length_) { - raw_buffer()[length_] = child_values->length(); + raw_buffer()[length_] = items->length(); } - out->Init(type_, length_, values_, ArrayPtr(child_values), + result->Init(type_, length_, values_, items, null_count_, nulls_); values_ = nulls_ = nullptr; capacity_ = length_ = null_count_ = 0; - return Status::OK(); + + return result; } - virtual Status ToArray(Array** out) { - ListArray* result = new ListArray(); - RETURN_NOT_OK(Transfer(result)); - *out = static_cast(result); - return Status::OK(); + std::shared_ptr Finish() override { + return Transfer(); } // Start a new variable-length list slot @@ -198,10 +179,12 @@ class ListBuilder : public Int32Builder { return Append(true); } - ArrayBuilder* value_builder() const { return value_builder_.get();} + const std::shared_ptr& value_builder() const { + return value_builder_; + } protected: - std::unique_ptr value_builder_; + std::shared_ptr value_builder_; }; diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc index 02eaaa7542bf0..f35a258e2cb57 100644 --- a/cpp/src/arrow/types/primitive-test.cc +++ b/cpp/src/arrow/types/primitive-test.cc @@ -37,6 +37,7 @@ #include "arrow/util/status.h" using std::string; +using std::shared_ptr; using std::unique_ptr; using std::vector; @@ -98,12 +99,12 @@ class TestPrimitiveBuilder : public TestBuilder { type_ = Attrs::type(); - ArrayBuilder* tmp; - ASSERT_OK(make_builder(pool_, type_, &tmp)); - builder_.reset(static_cast(tmp)); + std::shared_ptr 
tmp; + ASSERT_OK(MakeBuilder(pool_, type_, &tmp)); + builder_ = std::dynamic_pointer_cast(tmp); - ASSERT_OK(make_builder(pool_, type_, &tmp)); - builder_nn_.reset(static_cast(tmp)); + ASSERT_OK(MakeBuilder(pool_, type_, &tmp)); + builder_nn_ = std::dynamic_pointer_cast(tmp); } void RandomData(int N, double pct_null = 0.1) { @@ -112,7 +113,6 @@ class TestPrimitiveBuilder : public TestBuilder { } void CheckNullable() { - ArrayType result; ArrayType expected; int size = builder_->length(); @@ -125,7 +125,9 @@ class TestPrimitiveBuilder : public TestBuilder { int32_t ex_null_count = null_count(nulls_); expected.Init(size, ex_data, ex_null_count, ex_nulls); - ASSERT_OK(builder_->Transfer(&result)); + + std::shared_ptr result = std::dynamic_pointer_cast( + builder_->Finish()); // Builder is now reset ASSERT_EQ(0, builder_->length()); @@ -133,12 +135,11 @@ class TestPrimitiveBuilder : public TestBuilder { ASSERT_EQ(0, builder_->null_count()); ASSERT_EQ(nullptr, builder_->buffer()); - ASSERT_TRUE(result.Equals(expected)); - ASSERT_EQ(ex_null_count, result.null_count()); + ASSERT_TRUE(result->Equals(expected)); + ASSERT_EQ(ex_null_count, result->null_count()); } void CheckNonNullable() { - ArrayType result; ArrayType expected; int size = builder_nn_->length(); @@ -146,22 +147,24 @@ class TestPrimitiveBuilder : public TestBuilder { size * sizeof(T)); expected.Init(size, ex_data); - ASSERT_OK(builder_nn_->Transfer(&result)); + + std::shared_ptr result = std::dynamic_pointer_cast( + builder_nn_->Finish()); // Builder is now reset ASSERT_EQ(0, builder_nn_->length()); ASSERT_EQ(0, builder_nn_->capacity()); ASSERT_EQ(nullptr, builder_nn_->buffer()); - ASSERT_TRUE(result.Equals(expected)); - ASSERT_EQ(0, result.null_count()); + ASSERT_TRUE(result->Equals(expected)); + ASSERT_EQ(0, result->null_count()); } protected: TypePtr type_; TypePtr type_nn_; - unique_ptr builder_; - unique_ptr builder_nn_; + shared_ptr builder_; + shared_ptr builder_nn_; vector draws_; vector nulls_; @@ 
-225,15 +228,36 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendNull) { ASSERT_OK(this->builder_->AppendNull()); } - Array* result; - ASSERT_OK(this->builder_->ToArray(&result)); - unique_ptr holder(result); + auto result = this->builder_->Finish(); for (int i = 0; i < size; ++i) { ASSERT_TRUE(result->IsNull(i)); } } +TYPED_TEST(TestPrimitiveBuilder, TestArrayDtorDealloc) { + DECL_T(); + + int size = 10000; + + vector& draws = this->draws_; + vector& nulls = this->nulls_; + + int64_t memory_before = this->pool_->bytes_allocated(); + + this->RandomData(size); + + int i; + for (i = 0; i < size; ++i) { + ASSERT_OK(this->builder_->Append(draws[i], nulls[i] > 0)); + } + + do { + std::shared_ptr result = this->builder_->Finish(); + } while (false); + + ASSERT_EQ(memory_before, this->pool_->bytes_allocated()); +} TYPED_TEST(TestPrimitiveBuilder, TestAppendScalar) { DECL_T(); @@ -331,11 +355,11 @@ TYPED_TEST(TestPrimitiveBuilder, TestResize) { } TYPED_TEST(TestPrimitiveBuilder, TestReserve) { - int n = 100; - ASSERT_OK(this->builder_->Reserve(n)); + ASSERT_OK(this->builder_->Reserve(10)); ASSERT_EQ(0, this->builder_->length()); ASSERT_EQ(MIN_BUILDER_CAPACITY, this->builder_->capacity()); + ASSERT_OK(this->builder_->Reserve(90)); ASSERT_OK(this->builder_->Advance(100)); ASSERT_OK(this->builder_->Reserve(MIN_BUILDER_CAPACITY)); diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h index 09d43e7ec8b80..1073bb6e1c340 100644 --- a/cpp/src/arrow/types/primitive.h +++ b/cpp/src/arrow/types/primitive.h @@ -64,6 +64,8 @@ class PrimitiveArrayImpl : public PrimitiveArray { PrimitiveArrayImpl() : PrimitiveArray() {} + virtual ~PrimitiveArrayImpl() {} + PrimitiveArrayImpl(int32_t length, const std::shared_ptr& data, int32_t null_count = 0, const std::shared_ptr& nulls = nullptr) { @@ -197,24 +199,12 @@ class PrimitiveBuilder : public ArrayBuilder { return Status::OK(); } - // Initialize an array type instance with the results of this builder - // Transfers 
ownership of all buffers - Status Transfer(PrimitiveArray* out) { - out->Init(type_, length_, values_, null_count_, nulls_); + std::shared_ptr Finish() override { + std::shared_ptr result = std::make_shared(); + result->PrimitiveArray::Init(type_, length_, values_, null_count_, nulls_); values_ = nulls_ = nullptr; capacity_ = length_ = null_count_ = 0; - return Status::OK(); - } - - Status Transfer(ArrayType* out) { - return Transfer(static_cast(out)); - } - - virtual Status ToArray(Array** out) { - ArrayType* result = new ArrayType(); - RETURN_NOT_OK(Transfer(result)); - *out = static_cast(result); - return Status::OK(); + return result; } value_type* raw_buffer() { diff --git a/cpp/src/arrow/types/string-test.cc b/cpp/src/arrow/types/string-test.cc index 9af667295026b..8e82fd95dd808 100644 --- a/cpp/src/arrow/types/string-test.cc +++ b/cpp/src/arrow/types/string-test.cc @@ -166,23 +166,18 @@ class TestStringBuilder : public TestBuilder { void SetUp() { TestBuilder::SetUp(); type_ = TypePtr(new StringType()); - - ArrayBuilder* tmp; - ASSERT_OK(make_builder(pool_, type_, &tmp)); - builder_.reset(static_cast(tmp)); + builder_.reset(new StringBuilder(pool_, type_)); } void Done() { - Array* out; - ASSERT_OK(builder_->ToArray(&out)); - result_.reset(static_cast(out)); + result_ = std::dynamic_pointer_cast(builder_->Finish()); } protected: TypePtr type_; std::unique_ptr builder_; - std::unique_ptr result_; + std::shared_ptr result_; }; TEST_F(TestStringBuilder, TestScalarAppend) { diff --git a/cpp/src/arrow/types/string.h b/cpp/src/arrow/types/string.h index 5795cfed577c5..8ccc0a9698a54 100644 --- a/cpp/src/arrow/types/string.h +++ b/cpp/src/arrow/types/string.h @@ -71,28 +71,6 @@ struct VarcharType : public DataType { static const LayoutPtr byte1(new BytesType(1)); static const LayoutPtr physical_string = LayoutPtr(new ListLayoutType(byte1)); -// String is a logical type consisting of a physical list of 1-byte values -struct StringType : public DataType { - explicit 
StringType(bool nullable = true) - : DataType(LogicalType::STRING, nullable) {} - - StringType(const StringType& other) - : StringType() {} - - static char const *name() { - return "string"; - } - - virtual std::string ToString() const { - std::string result; - if (nullable) { - result.append("?"); - } - result.append(name()); - return result; - } -}; - // TODO: add a BinaryArray layer in between class StringArray : public ListArray { public: @@ -153,26 +131,23 @@ class StringArray : public ListArray { class StringBuilder : public ListBuilder { public: explicit StringBuilder(MemoryPool* pool, const TypePtr& type) : - ListBuilder(pool, type, - static_cast(new UInt8Builder(pool, value_type_))) { + ListBuilder(pool, type, std::make_shared(pool, value_type_)) { byte_builder_ = static_cast(value_builder_.get()); } Status Append(const std::string& value) { - RETURN_NOT_OK(ListBuilder::Append()); - return byte_builder_->Append(reinterpret_cast(value.c_str()), - value.size()); + return Append(value.c_str(), value.size()); } - Status Append(const uint8_t* value, int32_t length); + Status Append(const char* value, int32_t length) { + RETURN_NOT_OK(ListBuilder::Append()); + return byte_builder_->Append(reinterpret_cast(value), length); + } Status Append(const std::vector& values, uint8_t* null_bytes); - virtual Status ToArray(Array** out) { - StringArray* result = new StringArray(); - RETURN_NOT_OK(ListBuilder::Transfer(result)); - *out = static_cast(result); - return Status::OK(); + std::shared_ptr Finish() override { + return ListBuilder::Transfer(); } protected: diff --git a/cpp/src/arrow/types/struct-test.cc b/cpp/src/arrow/types/struct-test.cc index df6157104795e..9a4777e8b983d 100644 --- a/cpp/src/arrow/types/struct-test.cc +++ b/cpp/src/arrow/types/struct-test.cc @@ -17,15 +17,16 @@ #include +#include #include #include -#include "arrow/field.h" #include "arrow/type.h" #include "arrow/types/integer.h" #include "arrow/types/string.h" #include "arrow/types/struct.h" 
+using std::shared_ptr; using std::string; using std::vector; @@ -33,23 +34,23 @@ namespace arrow { TEST(TestStructType, Basics) { TypePtr f0_type = TypePtr(new Int32Type()); - Field f0("f0", f0_type); + auto f0 = std::make_shared("f0", f0_type); TypePtr f1_type = TypePtr(new StringType()); - Field f1("f1", f1_type); + auto f1 = std::make_shared("f1", f1_type); TypePtr f2_type = TypePtr(new UInt8Type()); - Field f2("f2", f2_type); + auto f2 = std::make_shared("f2", f2_type); - vector fields = {f0, f1, f2}; + vector > fields = {f0, f1, f2}; StructType struct_type(fields); - ASSERT_TRUE(struct_type.field(0).Equals(f0)); - ASSERT_TRUE(struct_type.field(1).Equals(f1)); - ASSERT_TRUE(struct_type.field(2).Equals(f2)); + ASSERT_TRUE(struct_type.field(0)->Equals(f0)); + ASSERT_TRUE(struct_type.field(1)->Equals(f1)); + ASSERT_TRUE(struct_type.field(2)->Equals(f2)); - ASSERT_EQ(struct_type.ToString(), "?struct"); + ASSERT_EQ(struct_type.ToString(), "struct"); // TODO: out of bounds for field(...) } diff --git a/cpp/src/arrow/types/struct.cc b/cpp/src/arrow/types/struct.cc index 6b233bc372af1..02af600b017d0 100644 --- a/cpp/src/arrow/types/struct.cc +++ b/cpp/src/arrow/types/struct.cc @@ -17,24 +17,6 @@ #include "arrow/types/struct.h" -#include -#include -#include -#include - namespace arrow { -std::string StructType::ToString() const { - std::stringstream s; - if (nullable) s << "?"; - s << "struct<"; - for (size_t i = 0; i < fields_.size(); ++i) { - if (i > 0) s << ", "; - const Field& field = fields_[i]; - s << field.name << ": " << field.type->ToString(); - } - s << ">"; - return s.str(); -} - } // namespace arrow diff --git a/cpp/src/arrow/types/struct.h b/cpp/src/arrow/types/struct.h index e575c31287cb2..5842534d35be1 100644 --- a/cpp/src/arrow/types/struct.h +++ b/cpp/src/arrow/types/struct.h @@ -18,33 +18,14 @@ #ifndef ARROW_TYPES_STRUCT_H #define ARROW_TYPES_STRUCT_H +#include #include #include -#include "arrow/field.h" #include "arrow/type.h" namespace arrow { 
-struct StructType : public DataType { - std::vector fields_; - - explicit StructType(const std::vector& fields, bool nullable = true) - : DataType(LogicalType::STRUCT, nullable) { - fields_ = fields; - } - - const Field& field(int i) const { - return fields_[i]; - } - - int num_children() const { - return fields_.size(); - } - - virtual std::string ToString() const; -}; - } // namespace arrow #endif // ARROW_TYPES_STRUCT_H diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index c53f307c9f59a..4272ce4285482 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -19,22 +19,6 @@ # arrow_util ####################################### -set(UTIL_SRCS - bit-util.cc - buffer.cc - memory-pool.cc - status.cc -) - -set(UTIL_LIBS -) - -add_library(arrow_util STATIC - ${UTIL_SRCS} -) -target_link_libraries(arrow_util ${UTIL_LIBS}) -SET_TARGET_PROPERTIES(arrow_util PROPERTIES LINKER_LANGUAGE CXX) - # Headers: top level install(FILES bit-util.h @@ -50,7 +34,7 @@ install(FILES add_library(arrow_test_util) target_link_libraries(arrow_test_util - arrow_util) +) SET_TARGET_PROPERTIES(arrow_test_util PROPERTIES LINKER_LANGUAGE CXX) @@ -64,7 +48,6 @@ add_library(arrow_test_main if (APPLE) target_link_libraries(arrow_test_main gtest - arrow_util arrow_test_util dl) set_target_properties(arrow_test_main @@ -72,7 +55,6 @@ if (APPLE) else() target_link_libraries(arrow_test_main gtest - arrow_util arrow_test_util pthread dl diff --git a/cpp/src/arrow/util/buffer.cc b/cpp/src/arrow/util/buffer.cc index 3f3807d4e2094..50f4716769d70 100644 --- a/cpp/src/arrow/util/buffer.cc +++ b/cpp/src/arrow/util/buffer.cc @@ -31,6 +31,8 @@ Buffer::Buffer(const std::shared_ptr& parent, int64_t offset, parent_ = parent; } +Buffer::~Buffer() {} + std::shared_ptr MutableBuffer::GetImmutableView() { return std::make_shared(this->get_shared_ptr(), 0, size()); } @@ -43,6 +45,12 @@ PoolBuffer::PoolBuffer(MemoryPool* pool) : pool_ = pool; } 
+PoolBuffer::~PoolBuffer() { + if (mutable_data_ != nullptr) { + pool_->Free(mutable_data_, capacity_); + } +} + Status PoolBuffer::Reserve(int64_t new_capacity) { if (!mutable_data_ || new_capacity > capacity_) { uint8_t* new_data; diff --git a/cpp/src/arrow/util/buffer.h b/cpp/src/arrow/util/buffer.h index 8704723eb0a89..0c3e210abd910 100644 --- a/cpp/src/arrow/util/buffer.h +++ b/cpp/src/arrow/util/buffer.h @@ -39,6 +39,7 @@ class Buffer : public std::enable_shared_from_this { Buffer(const uint8_t* data, int64_t size) : data_(data), size_(size) {} + virtual ~Buffer(); // An offset into data that is owned by another buffer, but we want to be // able to retain a valid pointer to it even after other shared_ptr's to the @@ -136,6 +137,7 @@ class ResizableBuffer : public MutableBuffer { class PoolBuffer : public ResizableBuffer { public: explicit PoolBuffer(MemoryPool* pool = nullptr); + virtual ~PoolBuffer(); virtual Status Resize(int64_t new_size); virtual Status Reserve(int64_t new_capacity); diff --git a/cpp/src/arrow/util/status.cc b/cpp/src/arrow/util/status.cc index c64b8a3d5f80a..c6e113ebea590 100644 --- a/cpp/src/arrow/util/status.cc +++ b/cpp/src/arrow/util/status.cc @@ -35,4 +35,44 @@ const char* Status::CopyState(const char* state) { return result; } +std::string Status::CodeAsString() const { + if (state_ == NULL) { + return "OK"; + } + + const char* type; + switch (code()) { + case StatusCode::OK: + type = "OK"; + break; + case StatusCode::OutOfMemory: + type = "Out of memory"; + break; + case StatusCode::KeyError: + type = "Key error"; + break; + case StatusCode::Invalid: + type = "Invalid"; + break; + case StatusCode::NotImplemented: + type = "NotImplemented"; + break; + } + return std::string(type); +} + +std::string Status::ToString() const { + std::string result(CodeAsString()); + if (state_ == NULL) { + return result; + } + + result.append(": "); + + uint32_t length; + memcpy(&length, state_, sizeof(length)); + 
result.append(reinterpret_cast(state_ + 7), length); + return result; +} + } // namespace arrow diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index df55bfac9eb4a..8fdd829010eef 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -45,6 +45,12 @@ if ("$ENV{CMAKE_EXPORT_COMPILE_COMMANDS}" STREQUAL "1") set(CMAKE_EXPORT_COMPILE_COMMANDS 1) endif() +find_program(CCACHE_FOUND ccache) +if(CCACHE_FOUND) + set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache) + set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ccache) +endif(CCACHE_FOUND) + ############################################################ # Compiler flags ############################################################ @@ -389,7 +395,12 @@ add_subdirectory(src/pyarrow) add_subdirectory(src/pyarrow/util) set(PYARROW_SRCS + src/pyarrow/common.cc + src/pyarrow/helpers.cc src/pyarrow/init.cc + src/pyarrow/status.cc + + src/pyarrow/adapters/builtin.cc ) set(LINK_LIBS @@ -410,18 +421,16 @@ endif() # Setup and build Cython modules ############################################################ -foreach(pyx_api_file - arrow/config.pyx - arrow/parquet.pyx) - set_source_files_properties(${pyx_api_file} PROPERTIES CYTHON_API 1) -endforeach(pyx_api_file) - set(USE_RELATIVE_RPATH ON) set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE) set(CYTHON_EXTENSIONS + array config + error parquet + scalar + schema ) foreach(module ${CYTHON_EXTENSIONS}) diff --git a/python/arrow/__init__.py b/python/arrow/__init__.py index e69de29bb2d1d..3c049b85e8c93 100644 --- a/python/arrow/__init__.py +++ b/python/arrow/__init__.py @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# flake8: noqa + +from arrow.array import (Array, from_pylist, total_allocated_bytes, + BooleanArray, NumericArray, + Int8Array, UInt8Array, + ListArray, StringArray) + +from arrow.error import ArrowException + +from arrow.scalar import ArrayValue, NA, Scalar + +from arrow.schema import (null, bool_, + int8, int16, int32, int64, + uint8, uint16, uint32, uint64, + float_, double, string, + list_, struct, field, + DataType, Field, Schema) diff --git a/python/arrow/array.pxd b/python/arrow/array.pxd new file mode 100644 index 0000000000000..e32d27769b5e1 --- /dev/null +++ b/python/arrow/array.pxd @@ -0,0 +1,85 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from arrow.includes.common cimport shared_ptr +from arrow.includes.arrow cimport CArray, LogicalType + +from arrow.scalar import NA + +from arrow.schema cimport DataType + +cdef extern from "Python.h": + int PySlice_Check(object) + +cdef class Array: + cdef: + shared_ptr[CArray] sp_array + CArray* ap + + cdef readonly: + DataType type + + cdef init(self, const shared_ptr[CArray]& sp_array) + cdef _getitem(self, int i) + + +cdef class BooleanArray(Array): + pass + + +cdef class NumericArray(Array): + pass + + +cdef class Int8Array(NumericArray): + pass + + +cdef class UInt8Array(NumericArray): + pass + + +cdef class Int16Array(NumericArray): + pass + + +cdef class UInt16Array(NumericArray): + pass + + +cdef class Int32Array(NumericArray): + pass + + +cdef class UInt32Array(NumericArray): + pass + + +cdef class Int64Array(NumericArray): + pass + + +cdef class UInt64Array(NumericArray): + pass + + +cdef class ListArray(Array): + pass + + +cdef class StringArray(Array): + pass diff --git a/python/arrow/array.pyx b/python/arrow/array.pyx new file mode 100644 index 0000000000000..3a3210d6cc100 --- /dev/null +++ b/python/arrow/array.pyx @@ -0,0 +1,179 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True + +from arrow.includes.arrow cimport * +cimport arrow.includes.pyarrow as pyarrow + +from arrow.compat import frombytes, tobytes +from arrow.error cimport check_status + +from arrow.scalar import NA + +def total_allocated_bytes(): + cdef MemoryPool* pool = pyarrow.GetMemoryPool() + return pool.bytes_allocated() + + +cdef class Array: + + cdef init(self, const shared_ptr[CArray]& sp_array): + self.sp_array = sp_array + self.ap = sp_array.get() + self.type = DataType() + self.type.init(self.sp_array.get().type()) + + property null_count: + + def __get__(self): + return self.sp_array.get().null_count() + + def __len__(self): + return self.sp_array.get().length() + + def isnull(self): + raise NotImplemented + + def __getitem__(self, key): + cdef: + Py_ssize_t n = len(self) + + if PySlice_Check(key): + start = key.start or 0 + while start < 0: + start += n + + stop = key.stop if key.stop is not None else n + while stop < 0: + stop += n + + step = key.step or 1 + if step != 1: + raise NotImplementedError + else: + return self.slice(start, stop) + + while key < 0: + key += len(self) + + if self.ap.IsNull(key): + return NA + else: + return self._getitem(key) + + cdef _getitem(self, int i): + raise NotImplementedError + + def slice(self, start, end): + pass + + +cdef class NullArray(Array): + pass + + +cdef class BooleanArray(Array): + pass + + +cdef class NumericArray(Array): + pass + + +cdef class Int8Array(NumericArray): + pass + + +cdef class UInt8Array(NumericArray): + pass + + +cdef class Int16Array(NumericArray): + pass + + +cdef class UInt16Array(NumericArray): + pass + + +cdef class Int32Array(NumericArray): + pass + + +cdef class UInt32Array(NumericArray): + pass + + +cdef class Int64Array(NumericArray): + pass + + +cdef class UInt64Array(NumericArray): + pass + + +cdef class FloatArray(NumericArray): + pass + + +cdef class DoubleArray(NumericArray): + pass + + +cdef class 
ListArray(Array): + pass + + +cdef class StringArray(Array): + pass + + +cdef dict _array_classes = { + LogicalType_NA: NullArray, + LogicalType_BOOL: BooleanArray, + LogicalType_INT64: Int64Array, + LogicalType_DOUBLE: DoubleArray, + LogicalType_LIST: ListArray, + LogicalType_STRING: StringArray, +} + +cdef object box_arrow_array(const shared_ptr[CArray]& sp_array): + if sp_array.get() == NULL: + raise ValueError('Array was NULL') + + cdef CDataType* data_type = sp_array.get().type().get() + + if data_type == NULL: + raise ValueError('Array data type was NULL') + + cdef Array arr = _array_classes[data_type.type]() + arr.init(sp_array) + return arr + + +def from_pylist(object list_obj, type=None): + """ + Convert Python list to Arrow array + """ + cdef: + shared_ptr[CArray] sp_array + + check_status(pyarrow.ConvertPySequence(list_obj, &sp_array)) + return box_arrow_array(sp_array) diff --git a/python/arrow/config.pyx b/python/arrow/config.pyx index 8f10beb3a2e72..521bc066cd4a5 100644 --- a/python/arrow/config.pyx +++ b/python/arrow/config.pyx @@ -2,7 +2,7 @@ # distutils: language = c++ # cython: embedsignature = True -cdef extern from 'pyarrow/init.h' namespace 'arrow::py': +cdef extern from 'pyarrow/init.h' namespace 'pyarrow': void pyarrow_init() pyarrow_init() diff --git a/python/arrow/error.pxd b/python/arrow/error.pxd new file mode 100644 index 0000000000000..c18cb3efffca6 --- /dev/null +++ b/python/arrow/error.pxd @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from arrow.includes.pyarrow cimport * + +cdef check_status(const Status& status) diff --git a/python/arrow/error.pyx b/python/arrow/error.pyx new file mode 100644 index 0000000000000..f1d516358819d --- /dev/null +++ b/python/arrow/error.pyx @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from arrow.includes.common cimport c_string + +from arrow.compat import frombytes + +class ArrowException(Exception): + pass + +cdef check_status(const Status& status): + if status.ok(): + return + + cdef c_string c_message = status.ToString() + raise ArrowException(frombytes(c_message)) diff --git a/python/arrow/includes/arrow.pxd b/python/arrow/includes/arrow.pxd index 3635ceb868596..fde5de910915a 100644 --- a/python/arrow/includes/arrow.pxd +++ b/python/arrow/includes/arrow.pxd @@ -20,4 +20,77 @@ from arrow.includes.common cimport * cdef extern from "arrow/api.h" namespace "arrow" nogil: - pass + + enum LogicalType" arrow::LogicalType::type": + LogicalType_NA" arrow::LogicalType::NA" + + LogicalType_BOOL" arrow::LogicalType::BOOL" + + LogicalType_UINT8" arrow::LogicalType::UINT8" + LogicalType_INT8" arrow::LogicalType::INT8" + LogicalType_UINT16" arrow::LogicalType::UINT16" + LogicalType_INT16" arrow::LogicalType::INT16" + LogicalType_UINT32" arrow::LogicalType::UINT32" + LogicalType_INT32" arrow::LogicalType::INT32" + LogicalType_UINT64" arrow::LogicalType::UINT64" + LogicalType_INT64" arrow::LogicalType::INT64" + + LogicalType_FLOAT" arrow::LogicalType::FLOAT" + LogicalType_DOUBLE" arrow::LogicalType::DOUBLE" + + LogicalType_STRING" arrow::LogicalType::STRING" + + LogicalType_LIST" arrow::LogicalType::LIST" + LogicalType_STRUCT" arrow::LogicalType::STRUCT" + + cdef cppclass CDataType" arrow::DataType": + LogicalType type + c_bool nullable + + c_bool Equals(const CDataType* other) + + c_string ToString() + + cdef cppclass MemoryPool" arrow::MemoryPool": + int64_t bytes_allocated() + + cdef cppclass CListType" arrow::ListType"(CDataType): + CListType(const shared_ptr[CDataType]& value_type, + c_bool nullable) + + cdef cppclass CStringType" arrow::StringType"(CDataType): + pass + + cdef cppclass CField" arrow::Field": + c_string name + shared_ptr[CDataType] type + + CField(const c_string& name, const shared_ptr[CDataType]& type) + + cdef cppclass CStructType" 
arrow::StructType"(CDataType): + CStructType(const vector[shared_ptr[CField]]& fields, + c_bool nullable) + + cdef cppclass CSchema" arrow::Schema": + CSchema(const shared_ptr[CField]& fields) + + cdef cppclass CArray" arrow::Array": + const shared_ptr[CDataType]& type() + + int32_t length() + int32_t null_count() + LogicalType logical_type() + + c_bool IsNull(int i) + + cdef cppclass CUInt8Array" arrow::UInt8Array"(CArray): + pass + + cdef cppclass CInt8Array" arrow::Int8Array"(CArray): + pass + + cdef cppclass CListArray" arrow::ListArray"(CArray): + pass + + cdef cppclass CStringArray" arrow::StringArray"(CListArray): + pass diff --git a/python/arrow/includes/common.pxd b/python/arrow/includes/common.pxd index f2fc826625e45..839427a699002 100644 --- a/python/arrow/includes/common.pxd +++ b/python/arrow/includes/common.pxd @@ -19,7 +19,7 @@ from libc.stdint cimport * from libcpp cimport bool as c_bool -from libcpp.string cimport string +from libcpp.string cimport string as c_string from libcpp.vector cimport vector # This must be included for cerr and other things to work @@ -29,6 +29,8 @@ cdef extern from "": cdef extern from "" namespace "std" nogil: cdef cppclass shared_ptr[T]: + shared_ptr() + shared_ptr(T*) T* get() void reset() void reset(T* p) diff --git a/python/arrow/includes/pyarrow.pxd b/python/arrow/includes/pyarrow.pxd index dcef663f3894d..3eed5b8542493 100644 --- a/python/arrow/includes/pyarrow.pxd +++ b/python/arrow/includes/pyarrow.pxd @@ -18,6 +18,28 @@ # distutils: language = c++ from arrow.includes.common cimport * +from arrow.includes.arrow cimport (CArray, CDataType, LogicalType, + MemoryPool) cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil: - pass + # We can later add more of the common status factory methods as needed + cdef Status Status_OK "Status::OK"() + + cdef cppclass Status: + Status() + + c_string ToString() + + c_bool ok() + c_bool IsOutOfMemory() + c_bool IsKeyError() + c_bool IsTypeError() + c_bool IsIOError() + c_bool 
IsValueError() + c_bool IsNotImplemented() + c_bool IsArrowError() + + shared_ptr[CDataType] GetPrimitiveType(LogicalType type, c_bool nullable) + Status ConvertPySequence(object obj, shared_ptr[CArray]* out) + + MemoryPool* GetMemoryPool() diff --git a/python/arrow/scalar.pxd b/python/arrow/scalar.pxd new file mode 100644 index 0000000000000..e193c09cd69a2 --- /dev/null +++ b/python/arrow/scalar.pxd @@ -0,0 +1,47 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from arrow.includes.common cimport * +from arrow.includes.arrow cimport CArray, CListArray + +from arrow.schema cimport DataType + +cdef class Scalar: + cdef readonly: + DataType type + + +cdef class NAType(Scalar): + pass + + +cdef class ArrayValue(Scalar): + cdef: + shared_ptr[CArray] array + int index + + +cdef class Int8Value(ArrayValue): + pass + + +cdef class ListValue(ArrayValue): + pass + + +cdef class StringValue(ArrayValue): + pass diff --git a/python/arrow/scalar.pyx b/python/arrow/scalar.pyx new file mode 100644 index 0000000000000..78dadecf9b422 --- /dev/null +++ b/python/arrow/scalar.pyx @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
# Scalar type of the missing-value marker. Its ``type`` attribute is the
# Arrow null data type and its repr is the literal text 'NA'.
cdef class NAType(Scalar):

    def __cinit__(self):
        # schema.null() is looked up through the Python-level module import.
        self.type = schema.null()

    def __repr__(self):
        return 'NA'

# Module-level instance created at import time.
NA = NAType()
# Python wrapper around a C++ arrow DataType.
cdef class DataType:
    cdef:
        # Owning reference to the C++ object.
        shared_ptr[CDataType] sp_type
        # Raw pointer cached alongside sp_type.
        CDataType* type

    cdef init(self, const shared_ptr[CDataType]& type)

# Python wrapper around a C++ arrow Field.
cdef class Field:
    cdef:
        # Owning reference to the C++ object.
        shared_ptr[CField] sp_field
        # Raw pointer cached alongside sp_field.
        CField* field

    cdef readonly:
        # Python-level DataType of the field, readable from Python.
        DataType type

# Python wrapper around a C++ arrow Schema.
cdef class Schema:
    cdef:
        shared_ptr[CSchema] sp_schema
        CSchema* schema
cdef class DataType:
    """
    Python handle for a C++ arrow DataType.

    Instances are populated through the C-level ``init`` method, which stores
    both the owning shared_ptr and the raw pointer obtained from it.
    """

    def __cinit__(self):
        pass

    cdef init(self, const shared_ptr[CDataType]& type):
        self.sp_type = type
        self.type = type.get()

    def __str__(self):
        return frombytes(self.type.ToString())

    def __repr__(self):
        return 'DataType({0})'.format(str(self))

    def __richcmp__(DataType self, object other, int op):
        # Only equality and inequality are defined for data types.
        if op != cpython.Py_EQ and op != cpython.Py_NE:
            raise TypeError('Invalid comparison')

        # ``other`` is taken as a plain object: a typed ``DataType other``
        # parameter also admits None, whose C pointer would then be read.
        # Returning NotImplemented lets Python fall back to its default
        # comparison for foreign objects.
        if not isinstance(other, DataType):
            return NotImplemented

        if op == cpython.Py_EQ:
            return self.type.Equals((<DataType> other).type)
        return not self.type.Equals((<DataType> other).type)


cdef class Field:
    """
    Python handle for a C++ arrow Field built from a name and a DataType.
    """

    # ``not None`` rejects type=None up front; otherwise type.sp_type would
    # be read from a None object.
    def __cinit__(self, object name, DataType type not None):
        self.type = type
        self.sp_field.reset(new CField(tobytes(name), type.sp_type))
        self.field = self.sp_field.get()

    def __repr__(self):
        return 'Field({0!r}, type={1})'.format(self.name, str(self.type))

    property name:

        def __get__(self):
            return frombytes(self.field.name)

# Cache of primitive DataType wrappers keyed by (logical type, nullable).
cdef dict _type_cache = {}

cdef DataType primitive_type(LogicalType type, bint nullable=True):
    # Return the cached wrapper when one exists so that repeated factory
    # calls hand back the same Python object.
    if (type, nullable) in _type_cache:
        return _type_cache[type, nullable]

    cdef DataType out = DataType()
    out.init(pyarrow.GetPrimitiveType(type, nullable))

    _type_cache[type, nullable] = out
    return out

#------------------------------------------------------------
# Type factory functions

def field(name, type):
    """
    Create a Field with the given name and DataType.
    """
    return Field(name, type)

def null():
    return primitive_type(LogicalType_NA)

def bool_(c_bool nullable=True):
    return primitive_type(LogicalType_BOOL, nullable)

def uint8(c_bool nullable=True):
    return primitive_type(LogicalType_UINT8, nullable)

def int8(c_bool nullable=True):
    return primitive_type(LogicalType_INT8, nullable)
def uint16(c_bool nullable=True):
    return primitive_type(LogicalType_UINT16, nullable)

def int16(c_bool nullable=True):
    return primitive_type(LogicalType_INT16, nullable)

def uint32(c_bool nullable=True):
    return primitive_type(LogicalType_UINT32, nullable)

def int32(c_bool nullable=True):
    return primitive_type(LogicalType_INT32, nullable)

def uint64(c_bool nullable=True):
    return primitive_type(LogicalType_UINT64, nullable)

def int64(c_bool nullable=True):
    return primitive_type(LogicalType_INT64, nullable)

def float_(c_bool nullable=True):
    return primitive_type(LogicalType_FLOAT, nullable)

def double(c_bool nullable=True):
    return primitive_type(LogicalType_DOUBLE, nullable)

def string(c_bool nullable=True):
    """
    UTF8 string
    """
    return primitive_type(LogicalType_STRING, nullable)

def list_(DataType value_type not None, c_bool nullable=True):
    """
    Create a list type whose elements have the given value type.

    ``value_type`` must be a DataType; None is rejected because its C++
    pointer is handed straight to the CListType constructor.
    """
    cdef DataType out = DataType()
    out.init(shared_ptr[CDataType](
        new CListType(value_type.sp_type, nullable)))
    return out

def struct(fields, c_bool nullable=True):
    """
    Create a struct type from an iterable of Field objects.

    Raises TypeError if an element is None (a non-Field element is rejected
    by the typed loop variable).
    """
    cdef:
        DataType out = DataType()
        Field field
        vector[shared_ptr[CField]] c_fields

    for field in fields:
        # A typed Field variable still admits None; reading sp_field from it
        # would touch an invalid object.
        if field is None:
            raise TypeError('struct fields must be Field instances, got None')
        c_fields.push_back(field.sp_field)

    out.init(shared_ptr[CDataType](
        new CStructType(c_fields, nullable)))
    return out
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from arrow.compat import unittest +import arrow + + +class TestArrayAPI(unittest.TestCase): + + def test_getitem_NA(self): + arr = arrow.from_pylist([1, None, 2]) + assert arr[1] is arrow.NA diff --git a/python/arrow/tests/test_convert_builtin.py b/python/arrow/tests/test_convert_builtin.py new file mode 100644 index 0000000000000..57e6ab9f0e7b5 --- /dev/null +++ b/python/arrow/tests/test_convert_builtin.py @@ -0,0 +1,85 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from arrow.compat import unittest +import arrow + + +class TestConvertList(unittest.TestCase): + + def test_boolean(self): + pass + + def test_empty_list(self): + arr = arrow.from_pylist([]) + assert len(arr) == 0 + assert arr.null_count == 0 + assert arr.type == arrow.null() + + def test_all_none(self): + arr = arrow.from_pylist([None, None]) + assert len(arr) == 2 + assert arr.null_count == 2 + assert arr.type == arrow.null() + + def test_integer(self): + arr = arrow.from_pylist([1, None, 3, None]) + assert len(arr) == 4 + assert arr.null_count == 2 + assert arr.type == arrow.int64() + + def test_garbage_collection(self): + import gc + bytes_before = arrow.total_allocated_bytes() + arrow.from_pylist([1, None, 3, None]) + gc.collect() + assert arrow.total_allocated_bytes() == bytes_before + + def test_double(self): + data = [1.5, 1, None, 2.5, None, None] + arr = arrow.from_pylist(data) + assert len(arr) == 6 + assert arr.null_count == 3 + assert arr.type == arrow.double() + + def test_string(self): + data = ['foo', b'bar', None, 'arrow'] + arr = arrow.from_pylist(data) + assert len(arr) == 4 + assert arr.null_count == 1 + assert arr.type == arrow.string() + + def test_mixed_nesting_levels(self): + arrow.from_pylist([1, 2, None]) + arrow.from_pylist([[1], [2], None]) + arrow.from_pylist([[1], [2], [None]]) + + with self.assertRaises(arrow.ArrowException): + arrow.from_pylist([1, 2, [1]]) + + with self.assertRaises(arrow.ArrowException): + arrow.from_pylist([1, 2, []]) + + with self.assertRaises(arrow.ArrowException): + arrow.from_pylist([[1], [2], [None, [1]]]) + + def test_list_of_int(self): + data = [[1, 2, 3], [], None, [1, 2]] + arr = arrow.from_pylist(data) + assert len(arr) == 4 + assert arr.null_count == 1 + assert arr.type == arrow.list_(arrow.int64()) diff --git a/python/arrow/tests/test_schema.py b/python/arrow/tests/test_schema.py new file mode 100644 index 0000000000000..a89edd74a0adf --- /dev/null +++ b/python/arrow/tests/test_schema.py @@ -0,0 
+1,51 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from arrow.compat import unittest +import arrow + + +class TestTypes(unittest.TestCase): + + def test_integers(self): + dtypes = ['int8', 'int16', 'int32', 'int64', + 'uint8', 'uint16', 'uint32', 'uint64'] + + for name in dtypes: + factory = getattr(arrow, name) + t = factory() + t_required = factory(False) + + assert str(t) == name + assert str(t_required) == '{0} not null'.format(name) + + def test_list(self): + value_type = arrow.int32() + list_type = arrow.list_(value_type) + assert str(list_type) == 'list' + + def test_string(self): + t = arrow.string() + assert str(t) == 'string' + + def test_field(self): + t = arrow.string() + f = arrow.field('foo', t) + + assert f.name == 'foo' + assert f.type is t + assert repr(f) == "Field('foo', type=string)" diff --git a/python/setup.py b/python/setup.py index f6b0a4bee8316..9a0de071a9c40 100644 --- a/python/setup.py +++ b/python/setup.py @@ -124,7 +124,10 @@ def _run_cmake(self): static_lib_option, source] self.spawn(cmake_command) - self.spawn(['make']) + args = ['make'] + if 'PYARROW_PARALLEL' in os.environ: + args.append('-j{0}'.format(os.environ['PYARROW_PARALLEL'])) + self.spawn(args) else: import shlex cmake_generator = 
'Visual Studio 14 2015' @@ -207,7 +210,7 @@ def get_ext_built(self, name): return name + suffix def get_cmake_cython_names(self): - return ['config', 'parquet'] + return ['array', 'config', 'error', 'parquet', 'scalar', 'schema'] def get_names(self): return self._found_names diff --git a/python/src/pyarrow/adapters/builtin.cc b/python/src/pyarrow/adapters/builtin.cc new file mode 100644 index 0000000000000..ae84fa12b0de6 --- /dev/null +++ b/python/src/pyarrow/adapters/builtin.cc @@ -0,0 +1,415 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include + +#include "pyarrow/adapters/builtin.h" + +#include + +#include "pyarrow/status.h" + +using arrow::ArrayBuilder; +using arrow::DataType; +using arrow::LogicalType; + +namespace pyarrow { + +static inline bool IsPyInteger(PyObject* obj) { +#if PYARROW_IS_PY2 + return PyLong_Check(obj) || PyInt_Check(obj); +#else + return PyLong_Check(obj); +#endif +} + +static inline bool IsPyBaseString(PyObject* obj) { +#if PYARROW_IS_PY2 + return PyString_Check(obj) || PyUnicode_Check(obj); +#else + return PyUnicode_Check(obj); +#endif +} + +class ScalarVisitor { + public: + ScalarVisitor() : + total_count_(0), + none_count_(0), + bool_count_(0), + int_count_(0), + float_count_(0), + string_count_(0) {} + + void Visit(PyObject* obj) { + ++total_count_; + if (obj == Py_None) { + ++none_count_; + } else if (PyFloat_Check(obj)) { + ++float_count_; + } else if (IsPyInteger(obj)) { + ++int_count_; + } else if (IsPyBaseString(obj)) { + ++string_count_; + } else { + // TODO(wesm): accumulate error information somewhere + } + } + + std::shared_ptr GetType() { + // TODO(wesm): handling mixed-type cases + if (float_count_) { + return arrow::DOUBLE; + } else if (int_count_) { + // TODO(wesm): tighter type later + return arrow::INT64; + } else if (bool_count_) { + return arrow::BOOL; + } else if (string_count_) { + return arrow::STRING; + } else { + return arrow::NA; + } + } + + int64_t total_count() const { + return total_count_; + } + + private: + int64_t total_count_; + int64_t none_count_; + int64_t bool_count_; + int64_t int_count_; + int64_t float_count_; + int64_t string_count_; + + // Place to accumulate errors + // std::vector errors_; +}; + +static constexpr int MAX_NESTING_LEVELS = 32; + +class SeqVisitor { + public: + SeqVisitor() : + max_nesting_level_(0) { + memset(nesting_histogram_, 0, MAX_NESTING_LEVELS * sizeof(int)); + } + + Status Visit(PyObject* obj, int level=0) { + Py_ssize_t size = PySequence_Size(obj); + + if (level > max_nesting_level_) { + 
max_nesting_level_ = level; + } + + for (int64_t i = 0; i < size; ++i) { + // TODO(wesm): Error checking? + // TODO(wesm): Specialize for PyList_GET_ITEM? + OwnedRef item_ref(PySequence_GetItem(obj, i)); + PyObject* item = item_ref.obj(); + + if (PyList_Check(item)) { + PY_RETURN_NOT_OK(Visit(item, level + 1)); + } else if (PyDict_Check(item)) { + return Status::NotImplemented("No type inference for dicts"); + } else { + // We permit nulls at any level of nesting + if (item == Py_None) { + // TODO + } else { + ++nesting_histogram_[level]; + scalars_.Visit(item); + } + } + } + return Status::OK(); + } + + std::shared_ptr GetType() { + if (scalars_.total_count() == 0) { + if (max_nesting_level_ == 0) { + return arrow::NA; + } else { + return nullptr; + } + } else { + std::shared_ptr result = scalars_.GetType(); + for (int i = 0; i < max_nesting_level_; ++i) { + result = std::make_shared(result); + } + return result; + } + } + + Status Validate() const { + if (scalars_.total_count() > 0) { + if (num_nesting_levels() > 1) { + return Status::ValueError("Mixed nesting levels not supported"); + } else if (max_observed_level() < max_nesting_level_) { + return Status::ValueError("Mixed nesting levels not supported"); + } + } + return Status::OK(); + } + + int max_observed_level() const { + int result = 0; + for (int i = 0; i < MAX_NESTING_LEVELS; ++i) { + if (nesting_histogram_[i] > 0) { + result = i; + } + } + return result; + } + + int num_nesting_levels() const { + int result = 0; + for (int i = 0; i < MAX_NESTING_LEVELS; ++i) { + if (nesting_histogram_[i] > 0) { + ++result; + } + } + return result; + } + + private: + ScalarVisitor scalars_; + + // Track observed + int max_nesting_level_; + int nesting_histogram_[MAX_NESTING_LEVELS]; +}; + +// Non-exhaustive type inference +static Status InferArrowType(PyObject* obj, int64_t* size, + std::shared_ptr* out_type) { + *size = PySequence_Size(obj); + if (PyErr_Occurred()) { + // Not a sequence + PyErr_Clear(); + return 
Status::TypeError("Object is not a sequence"); + } + + // For 0-length sequences, refuse to guess + if (*size == 0) { + *out_type = arrow::NA; + } + + SeqVisitor seq_visitor; + PY_RETURN_NOT_OK(seq_visitor.Visit(obj)); + PY_RETURN_NOT_OK(seq_visitor.Validate()); + + *out_type = seq_visitor.GetType(); + return Status::OK(); +} + +// Marshal Python sequence (list, tuple, etc.) to Arrow array +class SeqConverter { + public: + virtual Status Init(const std::shared_ptr& builder) { + builder_ = builder; + return Status::OK(); + } + + virtual Status AppendData(PyObject* seq) = 0; + + protected: + std::shared_ptr builder_; +}; + +template +class TypedConverter : public SeqConverter { + public: + Status Init(const std::shared_ptr& builder) override { + builder_ = builder; + typed_builder_ = static_cast(builder.get()); + return Status::OK(); + } + + protected: + BuilderType* typed_builder_; +}; + +class BoolConverter : public TypedConverter { + public: + Status AppendData(PyObject* seq) override { + return Status::OK(); + } +}; + +class Int64Converter : public TypedConverter { + public: + Status AppendData(PyObject* seq) override { + int64_t val; + Py_ssize_t size = PySequence_Size(seq); + for (int64_t i = 0; i < size; ++i) { + OwnedRef item(PySequence_GetItem(seq, i)); + if (item.obj() == Py_None) { + RETURN_ARROW_NOT_OK(typed_builder_->AppendNull()); + } else { + val = PyLong_AsLongLong(item.obj()); + RETURN_IF_PYERROR(); + RETURN_ARROW_NOT_OK(typed_builder_->Append(val)); + } + } + return Status::OK(); + } +}; + +class DoubleConverter : public TypedConverter { + public: + Status AppendData(PyObject* seq) override { + int64_t val; + Py_ssize_t size = PySequence_Size(seq); + for (int64_t i = 0; i < size; ++i) { + OwnedRef item(PySequence_GetItem(seq, i)); + if (item.obj() == Py_None) { + RETURN_ARROW_NOT_OK(typed_builder_->AppendNull()); + } else { + val = PyFloat_AsDouble(item.obj()); + RETURN_IF_PYERROR(); + RETURN_ARROW_NOT_OK(typed_builder_->Append(val)); + } + } + 
return Status::OK(); + } +}; + +class StringConverter : public TypedConverter { + public: + Status AppendData(PyObject* seq) override { + PyObject* item; + PyObject* bytes_obj; + OwnedRef tmp; + const char* bytes; + int32_t length; + Py_ssize_t size = PySequence_Size(seq); + for (int64_t i = 0; i < size; ++i) { + item = PySequence_GetItem(seq, i); + OwnedRef holder(item); + + if (item == Py_None) { + RETURN_ARROW_NOT_OK(typed_builder_->AppendNull()); + continue; + } else if (PyUnicode_Check(item)) { + tmp.reset(PyUnicode_AsUTF8String(item)); + RETURN_IF_PYERROR(); + bytes_obj = tmp.obj(); + } else if (PyBytes_Check(item)) { + bytes_obj = item; + } else { + return Status::TypeError("Non-string value encountered"); + } + // No error checking + length = PyBytes_GET_SIZE(bytes_obj); + bytes = PyBytes_AS_STRING(bytes_obj); + RETURN_ARROW_NOT_OK(typed_builder_->Append(bytes, length)); + } + return Status::OK(); + } +}; + +class ListConverter : public TypedConverter { + public: + Status Init(const std::shared_ptr& builder) override; + + Status AppendData(PyObject* seq) override { + Py_ssize_t size = PySequence_Size(seq); + for (int64_t i = 0; i < size; ++i) { + OwnedRef item(PySequence_GetItem(seq, i)); + if (item.obj() == Py_None) { + RETURN_ARROW_NOT_OK(typed_builder_->AppendNull()); + } else { + typed_builder_->Append(); + PY_RETURN_NOT_OK(value_converter_->AppendData(item.obj())); + } + } + return Status::OK(); + } + protected: + std::shared_ptr value_converter_; +}; + +// Dynamic constructor for sequence converters +std::shared_ptr GetConverter(const std::shared_ptr& type) { + switch (type->type) { + case LogicalType::BOOL: + return std::make_shared(); + case LogicalType::INT64: + return std::make_shared(); + case LogicalType::DOUBLE: + return std::make_shared(); + case LogicalType::STRING: + return std::make_shared(); + case LogicalType::LIST: + return std::make_shared(); + case LogicalType::STRUCT: + default: + return nullptr; + break; + } +} + +Status 
ListConverter::Init(const std::shared_ptr& builder) { + builder_ = builder; + typed_builder_ = static_cast(builder.get()); + + value_converter_ = GetConverter(static_cast( + builder->type().get())->value_type); + if (value_converter_ == nullptr) { + return Status::NotImplemented("value type not implemented"); + } + + value_converter_->Init(typed_builder_->value_builder()); + return Status::OK(); +} + +Status ConvertPySequence(PyObject* obj, std::shared_ptr* out) { + std::shared_ptr type; + int64_t size; + PY_RETURN_NOT_OK(InferArrowType(obj, &size, &type)); + + // Handle NA / NullType case + if (type->type == LogicalType::NA) { + out->reset(new arrow::Array(type, size, size)); + return Status::OK(); + } + + std::shared_ptr converter = GetConverter(type); + if (converter == nullptr) { + std::stringstream ss; + ss << "No type converter implemented for " + << type->ToString(); + return Status::NotImplemented(ss.str()); + } + + // Give the sequence converter an array builder + std::shared_ptr builder; + RETURN_ARROW_NOT_OK(arrow::MakeBuilder(GetMemoryPool(), type, &builder)); + converter->Init(builder); + + PY_RETURN_NOT_OK(converter->AppendData(obj)); + + *out = builder->Finish(); + + return Status::OK(); +} + +} // namespace pyarrow diff --git a/python/src/pyarrow/adapters/builtin.h b/python/src/pyarrow/adapters/builtin.h new file mode 100644 index 0000000000000..24886f4970d50 --- /dev/null +++ b/python/src/pyarrow/adapters/builtin.h @@ -0,0 +1,40 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Functions for converting between CPython built-in data structures and Arrow +// data structures + +#ifndef PYARROW_ADAPTERS_BUILTIN_H +#define PYARROW_ADAPTERS_BUILTIN_H + +#include + +#include + +#include "pyarrow/common.h" + +namespace arrow { class Array; } + +namespace pyarrow { + +class Status; + +Status ConvertPySequence(PyObject* obj, std::shared_ptr* out); + +} // namespace pyarrow + +#endif // PYARROW_ADAPTERS_BUILTIN_H diff --git a/cpp/src/arrow/field.cc b/python/src/pyarrow/adapters/pandas.h similarity index 76% rename from cpp/src/arrow/field.cc rename to python/src/pyarrow/adapters/pandas.h index 4568d905c2991..a4f4163808711 100644 --- a/cpp/src/arrow/field.cc +++ b/python/src/pyarrow/adapters/pandas.h @@ -15,17 +15,14 @@ // specific language governing permissions and limitations // under the License. 
-#include "arrow/field.h" +// Functions for converting between pandas's NumPy-based data representation +// and Arrow data structures -#include -#include +#ifndef PYARROW_ADAPTERS_PANDAS_H +#define PYARROW_ADAPTERS_PANDAS_H -namespace arrow { +namespace pyarrow { -std::string Field::ToString() const { - std::stringstream ss; - ss << this->name << " " << this->type->ToString(); - return ss.str(); -} +} // namespace pyarrow -} // namespace arrow +#endif // PYARROW_ADAPTERS_PANDAS_H diff --git a/python/src/pyarrow/api.h b/python/src/pyarrow/api.h index c2285de77bf10..72be6afe02c76 100644 --- a/python/src/pyarrow/api.h +++ b/python/src/pyarrow/api.h @@ -18,4 +18,11 @@ #ifndef PYARROW_API_H #define PYARROW_API_H +#include "pyarrow/status.h" + +#include "pyarrow/helpers.h" + +#include "pyarrow/adapters/builtin.h" +#include "pyarrow/adapters/pandas.h" + #endif // PYARROW_API_H diff --git a/python/src/pyarrow/common.cc b/python/src/pyarrow/common.cc new file mode 100644 index 0000000000000..a2748f99b6733 --- /dev/null +++ b/python/src/pyarrow/common.cc @@ -0,0 +1,71 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "pyarrow/common.h" + +#include +#include +#include + +#include +#include + +#include "pyarrow/status.h" + +namespace pyarrow { + +class PyArrowMemoryPool : public arrow::MemoryPool { + public: + PyArrowMemoryPool() : bytes_allocated_(0) {} + virtual ~PyArrowMemoryPool() {} + + arrow::Status Allocate(int64_t size, uint8_t** out) override { + std::lock_guard guard(pool_lock_); + *out = static_cast(std::malloc(size)); + if (*out == nullptr) { + std::stringstream ss; + ss << "malloc of size " << size << " failed"; + return arrow::Status::OutOfMemory(ss.str()); + } + + bytes_allocated_ += size; + + return arrow::Status::OK(); + } + + int64_t bytes_allocated() const override { + std::lock_guard guard(pool_lock_); + return bytes_allocated_; + } + + void Free(uint8_t* buffer, int64_t size) override { + std::lock_guard guard(pool_lock_); + std::free(buffer); + bytes_allocated_ -= size; + } + + private: + mutable std::mutex pool_lock_; + int64_t bytes_allocated_; +}; + +arrow::MemoryPool* GetMemoryPool() { + static PyArrowMemoryPool memory_pool; + return &memory_pool; +} + +} // namespace pyarrow diff --git a/python/src/pyarrow/common.h b/python/src/pyarrow/common.h new file mode 100644 index 0000000000000..a43e4d28c899a --- /dev/null +++ b/python/src/pyarrow/common.h @@ -0,0 +1,95 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef PYARROW_COMMON_H +#define PYARROW_COMMON_H + +#include + +namespace arrow { class MemoryPool; } + +namespace pyarrow { + +#define PYARROW_IS_PY2 PY_MAJOR_VERSION < 2 + +#define RETURN_ARROW_NOT_OK(s) do { \ + arrow::Status _s = (s); \ + if (!_s.ok()) { \ + return Status::ArrowError(s.ToString()); \ + } \ + } while (0); + +class OwnedRef { + public: + OwnedRef() : obj_(nullptr) {} + + OwnedRef(PyObject* obj) : + obj_(obj) {} + + ~OwnedRef() { + Py_XDECREF(obj_); + } + + void reset(PyObject* obj) { + if (obj_ != nullptr) { + Py_XDECREF(obj_); + } + obj_ = obj; + } + + PyObject* obj() const{ + return obj_; + } + + private: + PyObject* obj_; +}; + +struct PyObjectStringify { + OwnedRef tmp_obj; + const char* bytes; + + PyObjectStringify(PyObject* obj) { + PyObject* bytes_obj; + if (PyUnicode_Check(obj)) { + bytes_obj = PyUnicode_AsUTF8String(obj); + tmp_obj.reset(bytes_obj); + } else { + bytes_obj = obj; + } + bytes = PyBytes_AsString(bytes_obj); + } +}; + +// TODO(wesm): We can just let errors pass through. 
To be explored later +#define RETURN_IF_PYERROR() \ + if (PyErr_Occurred()) { \ + PyObject *exc_type, *exc_value, *traceback; \ + PyErr_Fetch(&exc_type, &exc_value, &traceback); \ + PyObjectStringify stringified(exc_value); \ + std::string message(stringified.bytes); \ + Py_DECREF(exc_type); \ + Py_DECREF(exc_value); \ + Py_DECREF(traceback); \ + return Status::UnknownError(message); \ + } + +arrow::MemoryPool* GetMemoryPool(); + +} // namespace pyarrow + +#endif // PYARROW_COMMON_H diff --git a/python/src/pyarrow/helpers.cc b/python/src/pyarrow/helpers.cc new file mode 100644 index 0000000000000..d0969dacc21e0 --- /dev/null +++ b/python/src/pyarrow/helpers.cc @@ -0,0 +1,57 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "pyarrow/helpers.h" + +#include + +using namespace arrow; + +namespace pyarrow { + +#define GET_PRIMITIVE_TYPE(NAME, Type) \ + case LogicalType::NAME: \ + if (nullable) { \ + return NAME; \ + } else { \ + return std::make_shared(nullable); \ + } \ + break; + +std::shared_ptr GetPrimitiveType(LogicalType::type type, + bool nullable) { + switch (type) { + case LogicalType::NA: + return NA; + GET_PRIMITIVE_TYPE(UINT8, UInt8Type); + GET_PRIMITIVE_TYPE(INT8, Int8Type); + GET_PRIMITIVE_TYPE(UINT16, UInt16Type); + GET_PRIMITIVE_TYPE(INT16, Int16Type); + GET_PRIMITIVE_TYPE(UINT32, UInt32Type); + GET_PRIMITIVE_TYPE(INT32, Int32Type); + GET_PRIMITIVE_TYPE(UINT64, UInt64Type); + GET_PRIMITIVE_TYPE(INT64, Int64Type); + GET_PRIMITIVE_TYPE(BOOL, BooleanType); + GET_PRIMITIVE_TYPE(FLOAT, FloatType); + GET_PRIMITIVE_TYPE(DOUBLE, DoubleType); + GET_PRIMITIVE_TYPE(STRING, StringType); + default: + return nullptr; + } +} + +} // namespace pyarrow diff --git a/cpp/src/arrow/types/null.h b/python/src/pyarrow/helpers.h similarity index 72% rename from cpp/src/arrow/types/null.h rename to python/src/pyarrow/helpers.h index c67f752d40989..1a24f056febe6 100644 --- a/cpp/src/arrow/types/null.h +++ b/python/src/pyarrow/helpers.h @@ -15,20 +15,20 @@ // specific language governing permissions and limitations // under the License. 
-#ifndef ARROW_TYPES_NULL_H -#define ARROW_TYPES_NULL_H +#ifndef PYARROW_HELPERS_H +#define PYARROW_HELPERS_H -#include -#include +#include +#include -#include "arrow/type.h" +namespace pyarrow { -namespace arrow { +using arrow::DataType; +using arrow::LogicalType; -struct NullType : public PrimitiveType { - PRIMITIVE_DECL(NullType, void, NA, 0, "null"); -}; +std::shared_ptr GetPrimitiveType(LogicalType::type type, + bool nullable); // Defined in helpers.cc; yields nullptr for LogicalType values it has no case for -} // namespace arrow +} // namespace pyarrow -#endif // ARROW_TYPES_NULL_H +#endif // PYARROW_HELPERS_H diff --git a/python/src/pyarrow/init.cc b/python/src/pyarrow/init.cc index c36f413725532..acd851e168743 100644 --- a/python/src/pyarrow/init.cc +++ b/python/src/pyarrow/init.cc @@ -17,13 +17,9 @@ #include "pyarrow/init.h" -namespace arrow { - -namespace py { +namespace pyarrow { void pyarrow_init() { } -} // namespace py - -} // namespace arrow +} // namespace pyarrow diff --git a/python/src/pyarrow/init.h b/python/src/pyarrow/init.h index 1fc9f10102696..71e67a20c1ca5 100644 --- a/python/src/pyarrow/init.h +++ b/python/src/pyarrow/init.h @@ -18,14 +18,10 @@ #ifndef PYARROW_INIT_H #define PYARROW_INIT_H -namespace arrow { - -namespace py { +namespace pyarrow { void pyarrow_init(); -} // namespace py - -} // namespace arrow +} // namespace pyarrow #endif // PYARROW_INIT_H diff --git a/python/src/pyarrow/status.cc b/python/src/pyarrow/status.cc new file mode 100644 index 0000000000000..1cd54f6a78560 --- /dev/null +++ b/python/src/pyarrow/status.cc @@ -0,0 +1,92 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A Status encapsulates the result of an operation. It may indicate success, +// or it may indicate an error with an associated error message. 
+// +// Multiple threads can invoke const methods on a Status without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same Status must use +// external synchronization. + +#include "pyarrow/status.h" + +#include +#include +#include + +namespace pyarrow { + +Status::Status(StatusCode code, const std::string& msg, int16_t posix_code) { + assert(code != StatusCode::OK); + const uint32_t size = msg.size(); + char* result = new char[size + 7]; + memcpy(result, &size, sizeof(size)); + result[4] = static_cast(code); + memcpy(result + 5, &posix_code, sizeof(posix_code)); + memcpy(result + 7, msg.c_str(), msg.size()); + state_ = result; +} + +const char* Status::CopyState(const char* state) { + uint32_t size; + memcpy(&size, state, sizeof(size)); + char* result = new char[size + 7]; + memcpy(result, state, size + 7); + return result; +} + +std::string Status::CodeAsString() const { + if (state_ == NULL) { + return "OK"; + } + + const char* type; + switch (code()) { + case StatusCode::OK: + type = "OK"; + break; + case StatusCode::OutOfMemory: + type = "Out of memory"; + break; + case StatusCode::KeyError: + type = "Key error"; + break; + case StatusCode::TypeError: + type = "Value error"; + break; + case StatusCode::ValueError: + type = "Value error"; + break; + case StatusCode::IOError: + type = "IO error"; + break; + case StatusCode::NotImplemented: + type = "Not implemented"; + break; + case StatusCode::ArrowError: + type = "Arrow C++ error"; + break; + case StatusCode::UnknownError: + type = "Unknown error"; + break; + } + return std::string(type); +} + +std::string Status::ToString() const { + std::string result(CodeAsString()); + if (state_ == NULL) { + return result; + } + + result.append(": "); + + uint32_t length; + memcpy(&length, state_, sizeof(length)); + result.append(reinterpret_cast(state_ + 7), length); + return result; +} + +} // namespace pyarrow diff --git a/python/src/pyarrow/status.h 
b/python/src/pyarrow/status.h new file mode 100644 index 0000000000000..cb8c8add210e4 --- /dev/null +++ b/python/src/pyarrow/status.h @@ -0,0 +1,144 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A Status encapsulates the result of an operation. It may indicate success, +// or it may indicate an error with an associated error message. +// +// Multiple threads can invoke const methods on a Status without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same Status must use +// external synchronization. + +#ifndef PYARROW_STATUS_H_ +#define PYARROW_STATUS_H_ + +#include +#include +#include + +namespace pyarrow { + +#define PY_RETURN_NOT_OK(s) do { \ + Status _s = (s); \ + if (!_s.ok()) return _s; \ + } while (0); // NOTE(review): the trailing ';' makes "PY_RETURN_NOT_OK(x);" two statements, which breaks an unbraced if/else around it + +enum class StatusCode: char { // one-byte code, stored at state_[4] of a non-OK Status + OK = 0, + OutOfMemory = 1, + KeyError = 2, + TypeError = 3, + ValueError = 4, + IOError = 5, + NotImplemented = 6, + + ArrowError = 7, + + UnknownError = 10 +}; + +class Status { + public: + // Create a success status. + Status() : state_(NULL) { } + ~Status() { delete[] state_; } + + // Copy the specified status. + Status(const Status& s); + void operator=(const Status& s); // returns void, so assignments cannot be chained + + // Return a success status. + static Status OK() { return Status(); } + + // Return error status of an appropriate type. 
+ static Status OutOfMemory(const std::string& msg, int16_t posix_code = -1) { + return Status(StatusCode::OutOfMemory, msg, posix_code); + } + + static Status KeyError(const std::string& msg) { + return Status(StatusCode::KeyError, msg, -1); + } + + static Status TypeError(const std::string& msg) { + return Status(StatusCode::TypeError, msg, -1); + } + + static Status IOError(const std::string& msg) { + return Status(StatusCode::IOError, msg, -1); + } + + static Status ValueError(const std::string& msg) { + return Status(StatusCode::ValueError, msg, -1); + } + + static Status NotImplemented(const std::string& msg) { + return Status(StatusCode::NotImplemented, msg, -1); + } + + static Status UnknownError(const std::string& msg) { + return Status(StatusCode::UnknownError, msg, -1); + } + + static Status ArrowError(const std::string& msg) { + return Status(StatusCode::ArrowError, msg, -1); + } + + // Returns true iff the status indicates success. + bool ok() const { return (state_ == NULL); } + + bool IsOutOfMemory() const { return code() == StatusCode::OutOfMemory; } + bool IsKeyError() const { return code() == StatusCode::KeyError; } + bool IsIOError() const { return code() == StatusCode::IOError; } + bool IsTypeError() const { return code() == StatusCode::TypeError; } + bool IsValueError() const { return code() == StatusCode::ValueError; } + + bool IsUnknownError() const { return code() == StatusCode::UnknownError; } + + bool IsArrowError() const { return code() == StatusCode::ArrowError; } // NOTE(review): no IsNotImplemented() predicate is declared although NotImplemented() exists + + // Return a string representation of this status suitable for printing. + // Returns the string "OK" for success. + std::string ToString() const; + + // Return a string representation of the status code, without the message + // text or posix code information. + std::string CodeAsString() const; + + // Get the POSIX code associated with this Status, or -1 if there is none. + int16_t posix_code() const; // NOTE(review): declared here, but status.cc in this patch contains no definition + + private: + // OK status has a NULL state_. 
Otherwise, state_ is a new[] array + // of the following form: + // state_[0..3] == length of message + // state_[4] == code + // state_[5..6] == posix_code + // state_[7..] == message + const char* state_; + + StatusCode code() const { // OK when state_ is NULL, otherwise the code byte at state_[4] + return ((state_ == NULL) ? + StatusCode::OK : static_cast(state_[4])); + } + + Status(StatusCode code, const std::string& msg, int16_t posix_code); + static const char* CopyState(const char* s); // returns a fresh new[] copy; the Status that stores it releases it with delete[] +}; + +inline Status::Status(const Status& s) { + state_ = (s.state_ == NULL) ? NULL : CopyState(s.state_); +} + +inline void Status::operator=(const Status& s) { + // The following condition catches both aliasing (when this == &s), + // and the common case where both s and *this are ok. + if (state_ != s.state_) { + delete[] state_; + state_ = (s.state_ == NULL) ? NULL : CopyState(s.state_); + } +} + +} // namespace pyarrow + +#endif // PYARROW_STATUS_H_