From 936955baf05d3bcdd5bbfefeb5a1215565dd9418 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 16 Dec 2019 16:02:53 +0100 Subject: [PATCH 1/5] Remove the unbox indirection --- cpp/src/arrow/python/python_to_arrow.cc | 199 ++++++++++++++---------- 1 file changed, 114 insertions(+), 85 deletions(-) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index d2a58067ebfe1..98e14980bd9a7 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -74,6 +74,10 @@ class SeqConverter { // converting Python objects to Arrow nested types virtual Status Init(ArrayBuilder* builder) = 0; + // Retrieve a Scalar obect from a single (non-sequence) Python datum, + // virtual function + // virtual Status GetScalar(PyObject* obj) = 0; + // Append a single (non-sequence) Python datum to the underlying builder, // virtual function virtual Status AppendSingleVirtual(PyObject* obj) = 0; @@ -129,70 +133,6 @@ struct NullChecker { static inline bool Check(PyObject* obj) { return internal::PandasObjectIsNull(obj); } }; -// ---------------------------------------------------------------------- -// Helper templates to append PyObject* to builder for each target conversion -// type - -template -struct Unbox {}; - -template -struct Unbox> { - using BuilderType = typename TypeTraits::BuilderType; - static inline Status Append(BuilderType* builder, PyObject* obj) { - typename Type::c_type value; - RETURN_NOT_OK(internal::CIntFromPython(obj, &value)); - return builder->Append(value); - } -}; - -template <> -struct Unbox { - static inline Status Append(HalfFloatBuilder* builder, PyObject* obj) { - npy_half val; - RETURN_NOT_OK(PyFloat_AsHalf(obj, &val)); - return builder->Append(val); - } -}; - -template <> -struct Unbox { - static inline Status Append(FloatBuilder* builder, PyObject* obj) { - if (internal::PyFloatScalar_Check(obj)) { - float val = static_cast(PyFloat_AsDouble(obj)); - RETURN_IF_PYERROR(); - return builder->Append(val); - } else if (internal::PyIntScalar_Check(obj)) { - float val = 0; - RETURN_NOT_OK(internal::IntegerScalarToFloat32Safe(obj, &val)); - return builder->Append(val); - } else { - return internal::InvalidValue(obj, "tried to convert to float32"); - } - } -}; - -template <> -struct Unbox { - static inline Status Append(DoubleBuilder* builder, PyObject* obj) { - if (PyFloat_Check(obj)) { - double val = PyFloat_AS_DOUBLE(obj); - return builder->Append(val); - } else if (internal::PyFloatScalar_Check(obj)) { - // Other kinds of float-y things - double val = PyFloat_AsDouble(obj); - RETURN_IF_PYERROR(); - return builder->Append(val); - } else if (internal::PyIntScalar_Check(obj)) { - double val = 0; - RETURN_NOT_OK(internal::IntegerScalarToDoubleSafe(obj, &val)); - return builder->Append(val); - } else { - return internal::InvalidValue(obj, "tried to convert to double"); - } - } -}; - // We use CRTP to avoid virtual calls to the AppendItem(), AppendNull(), and // IsNull() on the hot path template @@ -212,10 +152,6 @@ class TypedConverter : public SeqConverter { // Append a missing item (default implementation) Status AppendNull() { return this->typed_builder_->AppendNull(); } - // This is overridden in several subclasses, but if an Unbox implementation - // is defined, it will be used here - Status AppendItem(PyObject* obj) { return Unbox::Append(typed_builder_, obj); } - Status AppendSingle(PyObject* obj) { auto self = checked_cast(this); return CheckNull(obj) ? self->AppendNull() : self->AppendItem(obj); @@ -285,11 +221,75 @@ class BoolConverter }; // ---------------------------------------------------------------------- -// Sequence converter template for numeric (integer and floating point) types +// Sequence converter template for integer types -template -class NumericConverter - : public TypedConverter, null_coding> {}; +template > +class IntegerConverter + : public TypedConverter, null_coding> { + public: + Status AppendItem(PyObject* obj) { + typename Type::c_type value; + RETURN_NOT_OK(internal::CIntFromPython(obj, &value)); + return this->typed_builder_->Append(value); + } + +}; + +// ---------------------------------------------------------------------- +// Sequence converter template for floating types (float and double) + +template +class HalfFloatConverter + : public TypedConverter, null_coding> { + public: + Status AppendItem(PyObject* obj) { + npy_half val; + RETURN_NOT_OK(PyFloat_AsHalf(obj, &val)); + return this->typed_builder_->Append(val); + } +}; + +template +class FloatConverter + : public TypedConverter, null_coding> { + public: + Status AppendItem(PyObject* obj) { + if (internal::PyFloatScalar_Check(obj)) { + float val = static_cast(PyFloat_AsDouble(obj)); + RETURN_IF_PYERROR(); + return this->typed_builder_->Append(val); + } else if (internal::PyIntScalar_Check(obj)) { + float val = 0; + RETURN_NOT_OK(internal::IntegerScalarToFloat32Safe(obj, &val)); + return this->typed_builder_->Append(val); + } else { + return internal::InvalidValue(obj, "tried to convert to float32"); + } + } +}; + +template +class DoubleConverter + : public TypedConverter, null_coding> { + public: + Status AppendItem(PyObject* obj) { + if (PyFloat_Check(obj)) { + double val = PyFloat_AS_DOUBLE(obj); + return this->typed_builder_->Append(val); + } else if (internal::PyFloatScalar_Check(obj)) { + // Other kinds of float-y things + double val = PyFloat_AsDouble(obj); + RETURN_IF_PYERROR(); + return this->typed_builder_->Append(val); + } else if (internal::PyIntScalar_Check(obj)) { + double val = 0; + RETURN_NOT_OK(internal::IntegerScalarToDoubleSafe(obj, &val)); + return this->typed_builder_->Append(val); + } else { + return internal::InvalidValue(obj, "tried to convert to double"); + } + } +}; // ---------------------------------------------------------------------- // Sequence converters for temporal types @@ -1088,9 +1088,9 @@ class DecimalConverter std::shared_ptr decimal_type_; }; -#define NUMERIC_CONVERTER(TYPE_ENUM, TYPE) \ +#define INTEGER_CONVERTER_CASE(TYPE_ENUM, TYPE) \ case Type::TYPE_ENUM: \ - *out = std::unique_ptr(new NumericConverter); \ + *out = std::unique_ptr(new IntegerConverter); \ break; #define SIMPLE_CONVERTER_CASE(TYPE_ENUM, TYPE_CLASS) \ @@ -1105,17 +1105,17 @@ Status GetConverterFlat(const std::shared_ptr& type, bool strict_conve switch (type->id()) { SIMPLE_CONVERTER_CASE(NA, NullConverter); SIMPLE_CONVERTER_CASE(BOOL, BoolConverter); - NUMERIC_CONVERTER(INT8, Int8Type); - NUMERIC_CONVERTER(INT16, Int16Type); - NUMERIC_CONVERTER(INT32, Int32Type); - NUMERIC_CONVERTER(INT64, Int64Type); - NUMERIC_CONVERTER(UINT8, UInt8Type); - NUMERIC_CONVERTER(UINT16, UInt16Type); - NUMERIC_CONVERTER(UINT32, UInt32Type); - NUMERIC_CONVERTER(UINT64, UInt64Type); - NUMERIC_CONVERTER(HALF_FLOAT, HalfFloatType); - NUMERIC_CONVERTER(FLOAT, FloatType); - NUMERIC_CONVERTER(DOUBLE, DoubleType); + INTEGER_CONVERTER_CASE(INT8, Int8Type); + INTEGER_CONVERTER_CASE(INT16, Int16Type); + INTEGER_CONVERTER_CASE(INT32, Int32Type); + INTEGER_CONVERTER_CASE(INT64, Int64Type); + INTEGER_CONVERTER_CASE(UINT8, UInt8Type); + INTEGER_CONVERTER_CASE(UINT16, UInt16Type); + INTEGER_CONVERTER_CASE(UINT32, UInt32Type); + INTEGER_CONVERTER_CASE(UINT64, UInt64Type); + SIMPLE_CONVERTER_CASE(HALF_FLOAT, HalfFloatConverter); + SIMPLE_CONVERTER_CASE(FLOAT, FloatConverter); + SIMPLE_CONVERTER_CASE(DOUBLE, DoubleConverter); SIMPLE_CONVERTER_CASE(DECIMAL, DecimalConverter); SIMPLE_CONVERTER_CASE(BINARY, BytesConverter); SIMPLE_CONVERTER_CASE(LARGE_BINARY, LargeBytesConverter); @@ -1339,5 +1339,34 @@ Status ConvertPySequence(PyObject* obj, const PyConversionOptions& options, return ConvertPySequence(obj, nullptr, options, out); } +// Status ConvertPyValue(PyObject* value, const PyConversionOptions& options, +// std::shared_ptr* out) { +// PyAcquireGIL lock; +// OwnedRef tmp_value_nanny; +// std::shared_ptr real_type; +// std::shared_ptr scalar; +// tmp_value_nanny.reset(value); + +// // In some cases, type inference may be "loose", like strings. If the user +// // passed pa.string(), then we will error if we encounter any non-UTF8 +// // value. If not, then we will allow the result to be a BinaryArray +// bool strict_conversions = false; + +// // TODO(kszucs): add inference later +// // if (options.type == nullptr) { +// // RETURN_NOT_OK(InferArrowType(seq, mask, options.from_pandas, &real_type)); +// // } else { +// real_type = options.type; +// strict_conversions = true; + +// // Create the sequence converter, initialize with the builder +// std::unique_ptr converter; +// RETURN_NOT_OK( +// GetConverter(real_type, options.from_pandas, strict_conversions, &converter)); + +// // Retrieve result. Conversion may yield one or more array values +// return converter->GetScalar(value, &out); +// } + } // namespace py } // namespace arrow From fe684be774fb35bdf59996c4ff4d3306db026a68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 17 Dec 2019 02:01:35 +0100 Subject: [PATCH 2/5] User value converter --- cpp/src/arrow/python/python_to_arrow.cc | 412 ++++++++++++------------ cpp/src/arrow/type_traits.h | 12 + 2 files changed, 213 insertions(+), 211 deletions(-) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 98e14980bd9a7..1d82da578c56b 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -27,7 +27,9 @@ #include #include #include +#include +#include "arrow/scalar.h" #include "arrow/array.h" #include "arrow/builder.h" #include "arrow/status.h" @@ -53,6 +55,133 @@ using internal::checked_pointer_cast; namespace py { +template +struct Value {}; + +template <> +struct Value { + using ValueType = typename BooleanType::c_type; + + static inline Result FromPython(PyObject *obj) { + if (obj == Py_True) { + return true; + } else if (obj == Py_False) { + return false; + } else { + return internal::InvalidValue(obj, "tried to convert to boolean"); + } + } +}; + +template +struct Value> { + using ValueType = typename Type::c_type; + + static inline Result FromPython(PyObject *obj) { + ValueType value; + RETURN_NOT_OK(internal::CIntFromPython(obj, &value)); + return value; + } +}; + +template <> +struct Value { + using ValueType = typename HalfFloatType::c_type; + + static inline Result FromPython(PyObject *obj) { + ValueType value; + RETURN_NOT_OK(PyFloat_AsHalf(obj, &value)); + return value; + } +}; + +template <> +struct Value { + using ValueType = typename FloatType::c_type; + + static inline Result FromPython(PyObject *obj) { + ValueType value; + if (internal::PyFloatScalar_Check(obj)) { + value = static_cast(PyFloat_AsDouble(obj)); + RETURN_IF_PYERROR(); + } else if (internal::PyIntScalar_Check(obj)) { + RETURN_NOT_OK(internal::IntegerScalarToFloat32Safe(obj, &value)); + } else { + return internal::InvalidValue(obj, "tried to convert to float32"); + } + return value; + } +}; + +template <> +struct Value { + using ValueType = typename DoubleType::c_type; + + static inline Result FromPython(PyObject *obj) { + ValueType value; + if (PyFloat_Check(obj)) { + value = PyFloat_AS_DOUBLE(obj); + } else if (internal::PyFloatScalar_Check(obj)) { + // Other kinds of float-y things + value = PyFloat_AsDouble(obj); + RETURN_IF_PYERROR(); + } else if (internal::PyIntScalar_Check(obj)) { + RETURN_NOT_OK(internal::IntegerScalarToDoubleSafe(obj, &value)); + } else { + return internal::InvalidValue(obj, "tried to convert to double"); + } + return value; + } +}; + +template +struct Value> { + using ValueType = PyBytesView; + + static inline Result FromPython(PyObject *obj, const Type& /* unused */) { + ValueType view; + RETURN_NOT_OK(view.FromString(obj)); + return view; + } +}; + +template +struct Value> { + using ValueType = PyBytesView; + + static inline Result FromPython(PyObject *obj, const Type& type) { + ValueType view; + RETURN_NOT_OK(view.FromString(obj)); + const auto expected_length = type.byte_width(); + if (ARROW_PREDICT_FALSE(view.size != expected_length)) { + std::stringstream ss; + ss << "expected to be length " << expected_length << " was " << view.size; + return internal::InvalidValue(obj, ss.str()); + } else { + return view; + } + } +}; + +template +struct Value> { + using ValueType = PyBytesView; + + static inline Result FromPython(PyObject *obj, const Type& type) { + bool is_utf8 = false; + ValueType view; + RETURN_NOT_OK(view.FromString(obj, &is_utf8)); + // if (!is_utf8) { + // if (STRICT) { + // return internal::InvalidValue(obj, "was not a utf8 string"); + // } else { + // ++binary_count_; + // } + // } + return view; + } +}; + // ---------------------------------------------------------------------- // Sequence converter base and CRTP "middle" subclasses @@ -152,6 +281,13 @@ class TypedConverter : public SeqConverter { // Append a missing item (default implementation) Status AppendNull() { return this->typed_builder_->AppendNull(); } + // This is overridden in several subclasses, but if a Value::FromPython + // implementation is defined, it will be used here + Status AppendItem(PyObject* obj) { + ARROW_ASSIGN_OR_RAISE(auto value, Value::FromPython(obj)); + return typed_builder_->Append(value); + } + Status AppendSingle(PyObject* obj) { auto self = checked_cast(this); return CheckNull(obj) ? self->AppendNull() : self->AppendItem(obj); @@ -175,15 +311,16 @@ class TypedConverter : public SeqConverter { // Iterate over the items adding each one auto self = checked_cast(this); return internal::VisitSequenceMasked( - obj, mask, [self](PyObject* item, bool is_masked, bool* /* unused */) { - if (is_masked) { - return self->AppendNull(); - } else { - // This will also apply the null-checking convention in the event - // that the value is not masked - return self->AppendSingle(item); - } - }); + obj, mask, [self](PyObject* item, bool is_masked, bool* /* unused */) { + if (is_masked) { + return self->AppendNull(); + } else { + // This will also apply the null-checking convention in the event + // that the value is not masked + return self->AppendSingle(item); + } + } + ); } protected: @@ -207,89 +344,14 @@ class NullConverter template class BoolConverter - : public TypedConverter, null_coding> { - public: - Status AppendItem(PyObject* obj) { - if (obj == Py_True) { - return this->typed_builder_->Append(true); - } else if (obj == Py_False) { - return this->typed_builder_->Append(false); - } else { - return internal::InvalidValue(obj, "tried to convert to boolean"); - } - } -}; + : public TypedConverter, null_coding> {}; // ---------------------------------------------------------------------- -// Sequence converter template for integer types +// Sequence converter template for numeric (integer and floating point) types -template > -class IntegerConverter - : public TypedConverter, null_coding> { - public: - Status AppendItem(PyObject* obj) { - typename Type::c_type value; - RETURN_NOT_OK(internal::CIntFromPython(obj, &value)); - return this->typed_builder_->Append(value); - } - -}; - -// ---------------------------------------------------------------------- -// Sequence converter template for floating types (float and double) - -template -class HalfFloatConverter - : public TypedConverter, null_coding> { - public: - Status AppendItem(PyObject* obj) { - npy_half val; - RETURN_NOT_OK(PyFloat_AsHalf(obj, &val)); - return this->typed_builder_->Append(val); - } -}; - -template -class FloatConverter - : public TypedConverter, null_coding> { - public: - Status AppendItem(PyObject* obj) { - if (internal::PyFloatScalar_Check(obj)) { - float val = static_cast(PyFloat_AsDouble(obj)); - RETURN_IF_PYERROR(); - return this->typed_builder_->Append(val); - } else if (internal::PyIntScalar_Check(obj)) { - float val = 0; - RETURN_NOT_OK(internal::IntegerScalarToFloat32Safe(obj, &val)); - return this->typed_builder_->Append(val); - } else { - return internal::InvalidValue(obj, "tried to convert to float32"); - } - } -}; - -template -class DoubleConverter - : public TypedConverter, null_coding> { - public: - Status AppendItem(PyObject* obj) { - if (PyFloat_Check(obj)) { - double val = PyFloat_AS_DOUBLE(obj); - return this->typed_builder_->Append(val); - } else if (internal::PyFloatScalar_Check(obj)) { - // Other kinds of float-y things - double val = PyFloat_AsDouble(obj); - RETURN_IF_PYERROR(); - return this->typed_builder_->Append(val); - } else if (internal::PyIntScalar_Check(obj)) { - double val = 0; - RETURN_NOT_OK(internal::IntegerScalarToDoubleSafe(obj, &val)); - return this->typed_builder_->Append(val); - } else { - return internal::InvalidValue(obj, "tried to convert to double"); - } - } -}; +template +class NumericConverter + : public TypedConverter, null_coding> {}; // ---------------------------------------------------------------------- // Sequence converters for temporal types @@ -504,75 +566,40 @@ class TemporalConverter // ---------------------------------------------------------------------- // Sequence converters for Binary, FixedSizeBinary, String -namespace detail { - -template -inline Status AppendPyString(BuilderType* builder, const PyBytesView& view, - bool* is_full) { - if (view.size > BuilderType::memory_limit()) { - return Status::Invalid("string too large for datatype"); - } - DCHECK_GE(view.size, 0); - // Did we reach the builder size limit? - if (ARROW_PREDICT_FALSE(builder->value_data_length() + view.size > - BuilderType::memory_limit())) { - *is_full = true; - return Status::OK(); - } - RETURN_NOT_OK(builder->Append(::arrow::util::string_view(view.bytes, view.size))); - *is_full = false; - return Status::OK(); -} - -inline Status BuilderAppend(BinaryBuilder* builder, PyObject* obj, bool* is_full) { - PyBytesView view; - RETURN_NOT_OK(view.FromString(obj)); - return AppendPyString(builder, view, is_full); -} - -inline Status BuilderAppend(LargeBinaryBuilder* builder, PyObject* obj, bool* is_full) { - PyBytesView view; - RETURN_NOT_OK(view.FromString(obj)); - return AppendPyString(builder, view, is_full); -} - -inline Status BuilderAppend(FixedSizeBinaryBuilder* builder, PyObject* obj, - bool* is_full) { - PyBytesView view; - RETURN_NOT_OK(view.FromString(obj)); - const auto expected_length = - checked_cast(*builder->type()).byte_width(); - if (ARROW_PREDICT_FALSE(view.size != expected_length)) { - std::stringstream ss; - ss << "expected to be length " << expected_length << " was " << view.size; - return internal::InvalidValue(obj, ss.str()); - } - - return AppendPyString(builder, view, is_full); -} - -} // namespace detail - template class BinaryLikeConverter : public TypedConverter, null_coding> { public: - Status AppendItem(PyObject* obj) { - // Accessing members of the templated base requires using this-> here - bool is_full = false; - RETURN_NOT_OK(detail::BuilderAppend(this->typed_builder_, obj, &is_full)); + using BuilderType = typename TypeTraits::BuilderType; - // Exceeded capacity of builder - if (ARROW_PREDICT_FALSE(is_full)) { + Status AppendValue(const PyBytesView& view) { + if (view.size > BuilderType::memory_limit()) { + return Status::Invalid("string too large for datatype"); + } + DCHECK_GE(view.size, 0); + + // did we reach the builder size limit? + if (ARROW_PREDICT_FALSE(this->typed_builder_->value_data_length() + view.size > + BuilderType::memory_limit())) { + // builder would be full, so need to add a new chunk std::shared_ptr chunk; RETURN_NOT_OK(this->typed_builder_->Finish(&chunk)); this->chunks_.emplace_back(std::move(chunk)); - - // Append the item now that the builder has been reset - return detail::BuilderAppend(this->typed_builder_, obj, &is_full); } + // append the value + RETURN_NOT_OK(this->typed_builder_->Append( + ::arrow::util::string_view(view.bytes, view.size))); + return Status::OK(); } + + Status AppendItem(PyObject* obj) { + // Accessing members of the templated base requires using this-> here + ARROW_ASSIGN_OR_RAISE(auto value, Value::FromPython( + obj, checked_cast(*this->typed_builder_->type()) + )); + return AppendValue(value); + } }; template @@ -582,58 +609,25 @@ template class LargeBytesConverter : public BinaryLikeConverter {}; template -class FixedWidthBytesConverter - : public BinaryLikeConverter {}; +class FixedWidthBytesConverter : public BinaryLikeConverter { + + Status AppendItem(PyObject* obj) { + std::cout << "EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE\n"; + // Accessing members of the templated base requires using this-> here + ARROW_ASSIGN_OR_RAISE(auto value, Value::FromPython( + obj, checked_cast(*this->typed_builder_->type()) + )); + return AppendValue(value); + } +}; // For String/UTF8, if strict_conversions enabled, we reject any non-UTF8, // otherwise we allow but return results as BinaryArray -template -class StringConverter - : public TypedConverter, - null_coding> { +template +class StringConverter : public BinaryLikeConverter { public: StringConverter() : binary_count_(0) {} - Status Append(PyObject* obj, bool* is_full) { - if (STRICT) { - // Force output to be unicode / utf8 and validate that any binary values - // are utf8 - bool is_utf8 = false; - RETURN_NOT_OK(string_view_.FromString(obj, &is_utf8)); - if (!is_utf8) { - return internal::InvalidValue(obj, "was not a utf8 string"); - } - } else { - // Non-strict conversion; keep track of whether values are unicode or - // bytes; if any bytes are observe, the result will be bytes - if (PyUnicode_Check(obj)) { - RETURN_NOT_OK(string_view_.FromUnicode(obj)); - } else { - // If not unicode or bytes, FromBinary will error - RETURN_NOT_OK(string_view_.FromBinary(obj)); - ++binary_count_; - } - } - - return detail::AppendPyString(this->typed_builder_, string_view_, is_full); - } - - Status AppendItem(PyObject* obj) { - bool is_full = false; - RETURN_NOT_OK(Append(obj, &is_full)); - - // Exceeded capacity of builder - if (ARROW_PREDICT_FALSE(is_full)) { - std::shared_ptr chunk; - RETURN_NOT_OK(this->typed_builder_->Finish(&chunk)); - this->chunks_.emplace_back(std::move(chunk)); - - // Append the item now that the builder has been reset - RETURN_NOT_OK(Append(obj, &is_full)); - } - return Status::OK(); - } - virtual Status GetResult(std::shared_ptr* out) { RETURN_NOT_OK(SeqConverter::GetResult(out)); @@ -643,17 +637,13 @@ class StringConverter DCHECK(!STRICT); auto binary_type = - TypeTraits::type_singleton(); + TypeTraits::type_singleton(); return (*out)->View(binary_type, out); } return Status::OK(); } private: - // Create a single instance of PyBytesView here to prevent unnecessary object - // creation/destruction - PyBytesView string_view_; - int64_t binary_count_; }; @@ -1088,9 +1078,9 @@ class DecimalConverter std::shared_ptr decimal_type_; }; -#define INTEGER_CONVERTER_CASE(TYPE_ENUM, TYPE) \ +#define NUMERIC_CONVERTER_CASE(TYPE_ENUM, TYPE) \ case Type::TYPE_ENUM: \ - *out = std::unique_ptr(new IntegerConverter); \ + *out = std::unique_ptr(new NumericConverter); \ break; #define SIMPLE_CONVERTER_CASE(TYPE_ENUM, TYPE_CLASS) \ @@ -1105,17 +1095,17 @@ Status GetConverterFlat(const std::shared_ptr& type, bool strict_conve switch (type->id()) { SIMPLE_CONVERTER_CASE(NA, NullConverter); SIMPLE_CONVERTER_CASE(BOOL, BoolConverter); - INTEGER_CONVERTER_CASE(INT8, Int8Type); - INTEGER_CONVERTER_CASE(INT16, Int16Type); - INTEGER_CONVERTER_CASE(INT32, Int32Type); - INTEGER_CONVERTER_CASE(INT64, Int64Type); - INTEGER_CONVERTER_CASE(UINT8, UInt8Type); - INTEGER_CONVERTER_CASE(UINT16, UInt16Type); - INTEGER_CONVERTER_CASE(UINT32, UInt32Type); - INTEGER_CONVERTER_CASE(UINT64, UInt64Type); - SIMPLE_CONVERTER_CASE(HALF_FLOAT, HalfFloatConverter); - SIMPLE_CONVERTER_CASE(FLOAT, FloatConverter); - SIMPLE_CONVERTER_CASE(DOUBLE, DoubleConverter); + NUMERIC_CONVERTER_CASE(INT8, Int8Type); + NUMERIC_CONVERTER_CASE(INT16, Int16Type); + NUMERIC_CONVERTER_CASE(INT32, Int32Type); + NUMERIC_CONVERTER_CASE(INT64, Int64Type); + NUMERIC_CONVERTER_CASE(UINT8, UInt8Type); + NUMERIC_CONVERTER_CASE(UINT16, UInt16Type); + NUMERIC_CONVERTER_CASE(UINT32, UInt32Type); + NUMERIC_CONVERTER_CASE(UINT64, UInt64Type); + NUMERIC_CONVERTER_CASE(HALF_FLOAT, HalfFloatType); + NUMERIC_CONVERTER_CASE(FLOAT, FloatType); + NUMERIC_CONVERTER_CASE(DOUBLE, DoubleType); SIMPLE_CONVERTER_CASE(DECIMAL, DecimalConverter); SIMPLE_CONVERTER_CASE(BINARY, BytesConverter); SIMPLE_CONVERTER_CASE(LARGE_BINARY, LargeBytesConverter); diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 24b58b155d3ed..6636b346988cb 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -452,6 +452,18 @@ using is_half_float_type = std::is_same; template using enable_if_half_float = enable_if_t::value, R>; +// template +// using is_float_type = std::is_same; + +// template +// using enable_if_float = enable_if_t::value, R>; + +// template +// using is_double_type = std::is_same; + +// template +// using enable_if_double = enable_if_t::value, R>; + // Binary Types // Base binary refers to Binary/LargeBinary/String/LargeString From 81fdda6c9f07a741bcced60226791603d100f9a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 17 Dec 2019 02:30:03 +0100 Subject: [PATCH 3/5] String types --- cpp/src/arrow/python/python_to_arrow.cc | 51 +++++++++++++------------ 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 1d82da578c56b..7bd8ce46ed85d 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -138,7 +138,7 @@ template struct Value> { using ValueType = PyBytesView; - static inline Result FromPython(PyObject *obj, const Type& /* unused */) { + static inline Result FromPython(PyObject *obj) { ValueType view; RETURN_NOT_OK(view.FromString(obj)); return view; @@ -167,17 +167,9 @@ template struct Value> { using ValueType = PyBytesView; - static inline Result FromPython(PyObject *obj, const Type& type) { - bool is_utf8 = false; + static inline Result FromPython(PyObject *obj, bool* is_utf8) { ValueType view; - RETURN_NOT_OK(view.FromString(obj, &is_utf8)); - // if (!is_utf8) { - // if (STRICT) { - // return internal::InvalidValue(obj, "was not a utf8 string"); - // } else { - // ++binary_count_; - // } - // } + RETURN_NOT_OK(view.FromString(obj, is_utf8)); return view; } }; @@ -566,9 +558,8 @@ class TemporalConverter // ---------------------------------------------------------------------- // Sequence converters for Binary, FixedSizeBinary, String -template -class BinaryLikeConverter - : public TypedConverter, null_coding> { +template +class BinaryLikeConverter : public TypedConverter { public: using BuilderType = typename TypeTraits::BuilderType; @@ -595,39 +586,49 @@ class BinaryLikeConverter Status AppendItem(PyObject* obj) { // Accessing members of the templated base requires using this-> here - ARROW_ASSIGN_OR_RAISE(auto value, Value::FromPython( - obj, checked_cast(*this->typed_builder_->type()) - )); + ARROW_ASSIGN_OR_RAISE(auto value, Value::FromPython(obj)); return AppendValue(value); } }; template -class BytesConverter : public BinaryLikeConverter {}; +class BytesConverter : public BinaryLikeConverter, null_coding> {}; template -class LargeBytesConverter : public BinaryLikeConverter {}; +class LargeBytesConverter : public BinaryLikeConverter, null_coding> {}; template -class FixedWidthBytesConverter : public BinaryLikeConverter { - +class FixedWidthBytesConverter : public BinaryLikeConverter, null_coding> { + public: Status AppendItem(PyObject* obj) { - std::cout << "EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE\n"; // Accessing members of the templated base requires using this-> here ARROW_ASSIGN_OR_RAISE(auto value, Value::FromPython( - obj, checked_cast(*this->typed_builder_->type()) + obj, checked_cast(*this->typed_builder_->type()) )); - return AppendValue(value); + return this->AppendValue(value); } }; // For String/UTF8, if strict_conversions enabled, we reject any non-UTF8, // otherwise we allow but return results as BinaryArray template -class StringConverter : public BinaryLikeConverter { +class StringConverter : public BinaryLikeConverter, null_coding> { public: StringConverter() : binary_count_(0) {} + Status AppendItem(PyObject* obj) { + bool is_utf8; + ARROW_ASSIGN_OR_RAISE(auto value, Value::FromPython(obj, &is_utf8)); + if (!is_utf8) { + if (STRICT) { + return internal::InvalidValue(obj, "was not a utf8 string"); + } else { + ++binary_count_; + } + } + return this->AppendValue(value); + } + virtual Status GetResult(std::shared_ptr* out) { RETURN_NOT_OK(SeqConverter::GetResult(out)); From 1bb27985d19a6b241d3f929a33e38a8c2499db1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 17 Dec 2019 16:59:30 +0100 Subject: [PATCH 4/5] Time converters --- cpp/src/arrow/python/python_to_arrow.cc | 232 +++++++++++++----------- cpp/src/arrow/type_traits.h | 11 +- 2 files changed, 138 insertions(+), 105 deletions(-) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 7bd8ce46ed85d..072dd162c9e91 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -55,11 +55,32 @@ using internal::checked_pointer_cast; namespace py { +// ---------------------------------------------------------------------- +// NullCoding + +enum class NullCoding : char { NONE_ONLY, PANDAS_SENTINELS }; + +template +struct NullChecker {}; + +template <> +struct NullChecker { + static inline bool Check(PyObject* obj) { return obj == Py_None; } +}; + +template <> +struct NullChecker { + static inline bool Check(PyObject* obj) { return internal::PandasObjectIsNull(obj); } +}; + +// ---------------------------------------------------------------------- +// ValueConverter + template -struct Value {}; +struct ValueConverter {}; template <> -struct Value { +struct ValueConverter { using ValueType = typename BooleanType::c_type; static inline Result FromPython(PyObject *obj) { @@ -74,7 +95,7 @@ struct Value { }; template -struct Value> { +struct ValueConverter> { using ValueType = typename Type::c_type; static inline Result FromPython(PyObject *obj) { @@ -85,7 +106,7 @@ struct Value> { }; template <> -struct Value { +struct ValueConverter { using ValueType = typename HalfFloatType::c_type; static inline Result FromPython(PyObject *obj) { @@ -96,7 +117,7 @@ struct Value { }; template <> -struct Value { +struct ValueConverter { using ValueType = typename FloatType::c_type; static inline Result FromPython(PyObject *obj) { @@ -114,7 +135,7 @@ struct Value { }; template <> -struct Value { +struct ValueConverter { using ValueType = typename DoubleType::c_type; static inline Result FromPython(PyObject *obj) { @@ -134,8 +155,95 @@ struct Value { } }; +template <> +struct ValueConverter { + using ValueType = typename Date32Type::c_type; + + static inline Result FromPython(PyObject *obj) { + ValueType value; + if (PyDate_Check(obj)) { + auto pydate = reinterpret_cast(obj); + value = static_cast(internal::PyDate_to_days(pydate)); + } else { + RETURN_NOT_OK(internal::CIntFromPython(obj, &value, "Integer too large for date32")); + } + return value; + } +}; + +template<> +struct ValueConverter { + using ValueType = typename Date64Type::c_type; + + static inline Result FromPython(PyObject *obj) { + ValueType value; + if (PyDateTime_Check(obj)) { + auto pydate = reinterpret_cast(obj); + value = internal::PyDateTime_to_ms(pydate); + // Truncate any intraday milliseconds + value -= value % 86400000LL; + } else if (PyDate_Check(obj)) { + auto pydate = reinterpret_cast(obj); + value = internal::PyDate_to_ms(pydate); + } else { + RETURN_NOT_OK(internal::CIntFromPython(obj, &value, "Integer too large for date64")); + } + return value; + } +}; + +template<> +struct ValueConverter { + using ValueType = typename Time32Type::c_type; + + static inline Result FromPython(PyObject *obj, TimeUnit::type unit) { + ValueType value; + if (PyTime_Check(obj)) { + // datetime.time stores microsecond resolution + switch (unit) { + case TimeUnit::SECOND: + value = static_cast(internal::PyTime_to_s(obj)); + break; + case TimeUnit::MILLI: + value = static_cast(internal::PyTime_to_ms(obj)); + break; + default: + return Status::UnknownError("Invalid time unit"); + } + } else { + RETURN_NOT_OK(internal::CIntFromPython(obj, &value, "Integer too large for int32")); + } + return value; + } +}; + +template<> +struct ValueConverter { + using ValueType = typename Time64Type::c_type; + + static inline Result FromPython(PyObject *obj, TimeUnit::type unit) { + ValueType value; + if (PyTime_Check(obj)) { + // datetime.time stores microsecond resolution + switch (unit) { + case TimeUnit::MICRO: + value = internal::PyTime_to_us(obj); + break; + case TimeUnit::NANO: + value = internal::PyTime_to_ns(obj); + break; + default: + return Status::UnknownError("Invalid time unit"); + } + } else { + RETURN_NOT_OK(internal::CIntFromPython(obj, &value, "Integer too large for int64")); + } + return value; + } +}; + template -struct Value> { +struct ValueConverter> { using ValueType = PyBytesView; static inline Result FromPython(PyObject *obj) { @@ -146,7 +254,7 @@ struct Value> { }; template -struct Value> { +struct ValueConverter> { using ValueType = PyBytesView; static inline Result FromPython(PyObject *obj, const Type& type) { @@ -164,7 +272,7 @@ struct Value> { }; template -struct Value> { +struct ValueConverter> { using ValueType = PyBytesView; static inline Result FromPython(PyObject *obj, bool* is_utf8) { @@ -239,21 +347,6 @@ class SeqConverter { std::vector> chunks_; }; -enum class NullCoding : char { NONE_ONLY, PANDAS_SENTINELS }; - -template -struct NullChecker {}; - -template <> -struct NullChecker { - static inline bool Check(PyObject* obj) { return obj == Py_None; } -}; - -template <> -struct NullChecker { - static inline bool Check(PyObject* obj) { return internal::PandasObjectIsNull(obj); } -}; - // We use CRTP to avoid virtual calls to the AppendItem(), AppendNull(), and // IsNull() on the hot path template @@ -276,7 +369,7 @@ class TypedConverter : public SeqConverter { // This is overridden in several subclasses, but if a Value::FromPython // implementation is defined, it will be used here Status AppendItem(PyObject* obj) { - ARROW_ASSIGN_OR_RAISE(auto value, Value::FromPython(obj)); + ARROW_ASSIGN_OR_RAISE(auto value, ValueConverter::FromPython(obj)); return typed_builder_->Append(value); } @@ -350,98 +443,32 @@ class NumericConverter template class Date32Converter - : public TypedConverter, null_coding> { - public: - Status AppendItem(PyObject* obj) { - int32_t t; - if (PyDate_Check(obj)) { - auto pydate = reinterpret_cast(obj); - t = static_cast(internal::PyDate_to_days(pydate)); - } else { - RETURN_NOT_OK(internal::CIntFromPython(obj, &t, "Integer too large for date32")); - } - return this->typed_builder_->Append(t); - } -}; + : public TypedConverter, null_coding> {}; template class Date64Converter - : public TypedConverter, null_coding> { - public: - Status AppendItem(PyObject* obj) { - int64_t t; - if (PyDateTime_Check(obj)) { - auto pydate = reinterpret_cast(obj); - t = internal::PyDateTime_to_ms(pydate); - // Truncate any intraday milliseconds - t -= t % 86400000LL; - } else if (PyDate_Check(obj)) { - auto pydate = reinterpret_cast(obj); - t = internal::PyDate_to_ms(pydate); - } else { - RETURN_NOT_OK(internal::CIntFromPython(obj, &t, "Integer too large for date64")); - } - return this->typed_builder_->Append(t); - } -}; + : public TypedConverter, null_coding> {}; template -class Time32Converter - : public TypedConverter, null_coding> { +class Time32Converter : public TypedConverter, null_coding> { public: explicit Time32Converter(TimeUnit::type unit) : unit_(unit) {} - Status AppendItem(PyObject* obj) { - // TODO(kszucs): option for strict conversion? - int32_t t; - if (PyTime_Check(obj)) { - // datetime.time stores microsecond resolution - switch (unit_) { - case TimeUnit::SECOND: - t = static_cast(internal::PyTime_to_s(obj)); - break; - case TimeUnit::MILLI: - t = static_cast(internal::PyTime_to_ms(obj)); - break; - default: - return Status::UnknownError("Invalid time unit"); - } - } else { - RETURN_NOT_OK(internal::CIntFromPython(obj, &t, "Integer too large for int32")); - } - return this->typed_builder_->Append(t); + ARROW_ASSIGN_OR_RAISE(auto value, ValueConverter::FromPython(obj, unit_)); + return this->typed_builder_->Append(value); } - private: TimeUnit::type unit_; }; template -class Time64Converter - : public TypedConverter, null_coding> { +class Time64Converter : public TypedConverter, null_coding> { public: explicit Time64Converter(TimeUnit::type unit) : unit_(unit) {} - Status AppendItem(PyObject* obj) { - int64_t t; - if (PyTime_Check(obj)) { - // datetime.time stores microsecond resolution - switch (unit_) { - case TimeUnit::MICRO: - t = internal::PyTime_to_us(obj); - break; - case TimeUnit::NANO: - t = internal::PyTime_to_ns(obj); - break; - default: - return Status::UnknownError("Invalid time unit"); - } - } else { - RETURN_NOT_OK(internal::CIntFromPython(obj, &t, "Integer too large for int64")); - } - return this->typed_builder_->Append(t); + ARROW_ASSIGN_OR_RAISE(auto value, ValueConverter::FromPython(obj, unit_)); + return this->typed_builder_->Append(value); } - private: TimeUnit::type unit_; }; @@ -585,8 +612,7 @@ class BinaryLikeConverter : public TypedConverter { } Status AppendItem(PyObject* obj) { - // Accessing members of the templated base requires using this-> here - ARROW_ASSIGN_OR_RAISE(auto value, Value::FromPython(obj)); + ARROW_ASSIGN_OR_RAISE(auto value, ValueConverter::FromPython(obj)); return AppendValue(value); } }; @@ -602,7 +628,7 @@ class FixedWidthBytesConverter : public BinaryLikeConverter here - ARROW_ASSIGN_OR_RAISE(auto value, Value::FromPython( + ARROW_ASSIGN_OR_RAISE(auto value, ValueConverter::FromPython( obj, checked_cast(*this->typed_builder_->type()) )); return this->AppendValue(value); @@ -618,7 +644,7 @@ class StringConverter : public BinaryLikeConverter::FromPython(obj, &is_utf8)); + ARROW_ASSIGN_OR_RAISE(auto value, ValueConverter::FromPython(obj, &is_utf8)); if (!is_utf8) { if (STRICT) { return internal::InvalidValue(obj, "was not a utf8 string"); diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 6636b346988cb..c465eaa0f7691 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -619,11 +619,18 @@ template using enable_if_8bit_int = enable_if_t::value, R>; template -using is_paramater_free_type = +using is_parameter_free_type = std::integral_constant::is_parameter_free>; template -using enable_if_parameter_free = enable_if_t::value, R>; +using enable_if_parameter_free = enable_if_t::value, R>; + +template +using is_parametric_type = + std::integral_constant::value>; + +template +using enable_if_parametric = enable_if_t::value, R>; // Physical representation quirks From 08a8028fb953e9f03ea8f35c70997b73bb08eb78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 6 Jan 2020 18:49:21 +0100 Subject: [PATCH 5/5] Specialize implementations using methods instead --- cpp/src/arrow/python/python_to_arrow.cc | 322 +++++++----------------- 1 file changed, 93 insertions(+), 229 deletions(-) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 072dd162c9e91..aae855ea87b46 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -76,14 +76,9 @@ struct NullChecker { // ---------------------------------------------------------------------- // ValueConverter -template -struct ValueConverter {}; +struct ValueConverter { -template <> -struct ValueConverter { - using ValueType = typename BooleanType::c_type; - - static inline Result FromPython(PyObject *obj) { + static inline Result FromPython(const BooleanType& type, PyObject *obj) { if (obj == Py_True) { return true; } else if (obj == Py_False) { @@ -92,38 +87,24 @@ struct ValueConverter { return internal::InvalidValue(obj, "tried to convert to boolean"); } } -}; -template -struct ValueConverter> { - using ValueType = typename Type::c_type; - - static inline Result FromPython(PyObject *obj) { - ValueType value; + template> + static inline Result FromPython(const Type& type, PyObject* obj) { + typename Type::c_type value; RETURN_NOT_OK(internal::CIntFromPython(obj, &value)); return value; } -}; - -template <> -struct ValueConverter { - using ValueType = typename HalfFloatType::c_type; - static inline Result FromPython(PyObject *obj) { - ValueType value; + static inline Result FromPython(const HalfFloatType& type, PyObject *obj) { + HalfFloatType::c_type value; RETURN_NOT_OK(PyFloat_AsHalf(obj, &value)); return value; } -}; - -template <> -struct ValueConverter { - using ValueType = typename FloatType::c_type; - static inline Result FromPython(PyObject *obj) { - ValueType value; + static inline Result FromPython(const FloatType& type, PyObject *obj) { + float value; if (internal::PyFloatScalar_Check(obj)) { - value = static_cast(PyFloat_AsDouble(obj)); + value = static_cast(PyFloat_AsDouble(obj)); RETURN_IF_PYERROR(); } else if (internal::PyIntScalar_Check(obj)) { RETURN_NOT_OK(internal::IntegerScalarToFloat32Safe(obj, &value)); @@ -132,14 +113,9 @@ struct ValueConverter { } return value; } -}; - -template <> -struct ValueConverter { - using ValueType = typename DoubleType::c_type; - static inline Result FromPython(PyObject *obj) { - ValueType value; + static inline Result FromPython(const DoubleType& type, PyObject *obj) { + double value; if (PyFloat_Check(obj)) { value = PyFloat_AS_DOUBLE(obj); } else if (internal::PyFloatScalar_Check(obj)) { @@ -153,30 +129,26 @@ struct ValueConverter { } return value; } -}; -template <> -struct ValueConverter { - using ValueType = typename Date32Type::c_type; + static inline Result FromPython(const DecimalType& type, PyObject *obj) { + Decimal128 value; + RETURN_NOT_OK(internal::DecimalFromPyObject(obj, type, &value)); + return value; + } - static inline Result FromPython(PyObject *obj) { - ValueType value; + static inline Result FromPython(const Date32Type& type, PyObject *obj) { + int32_t value; if (PyDate_Check(obj)) { auto pydate = reinterpret_cast(obj); - value = static_cast(internal::PyDate_to_days(pydate)); + value = static_cast(internal::PyDate_to_days(pydate)); } else { RETURN_NOT_OK(internal::CIntFromPython(obj, &value, "Integer too large for date32")); } return value; } -}; - -template<> -struct ValueConverter { - using ValueType = typename Date64Type::c_type; - static inline Result FromPython(PyObject *obj) { - ValueType value; + static inline Result FromPython(const Date64Type& type, PyObject *obj) { + int64_t value; if (PyDateTime_Check(obj)) { auto pydate = reinterpret_cast(obj); value = internal::PyDateTime_to_ms(pydate); @@ -190,17 +162,12 @@ struct ValueConverter { } return value; } -}; -template<> -struct ValueConverter { - using ValueType = typename Time32Type::c_type; - - static inline Result FromPython(PyObject *obj, TimeUnit::type unit) { - ValueType value; + static inline Result FromPython(const Time32Type& type, PyObject *obj) { + int32_t value; if (PyTime_Check(obj)) { // datetime.time stores microsecond resolution - switch (unit) { + switch (type.unit()) { case TimeUnit::SECOND: value = static_cast(internal::PyTime_to_s(obj)); break; @@ -215,17 +182,12 @@ struct ValueConverter { } return value; } -}; - -template<> -struct ValueConverter { - using ValueType = typename Time64Type::c_type; - static inline Result FromPython(PyObject *obj, TimeUnit::type unit) { - ValueType value; + static inline Result FromPython(const Time64Type& type, PyObject *obj) { + int64_t value; if (PyTime_Check(obj)) { // datetime.time stores microsecond resolution - switch (unit) { + switch (type.unit()) { case TimeUnit::MICRO: value = internal::PyTime_to_us(obj); break; @@ -240,25 +202,16 @@ struct ValueConverter { } return value; } -}; - -template -struct ValueConverter> { - using ValueType = PyBytesView; - static inline Result FromPython(PyObject *obj) { - ValueType view; + template> + static inline Result FromPython(const Type& type, PyObject *obj) { + PyBytesView view; RETURN_NOT_OK(view.FromString(obj)); return view; } -}; -template -struct ValueConverter> { - using ValueType = PyBytesView; - - static inline Result FromPython(PyObject *obj, const Type& type) { - ValueType view; + static inline Result FromPython(const FixedSizeBinaryType& type, PyObject *obj) { + PyBytesView view; RETURN_NOT_OK(view.FromString(obj)); const auto expected_length = type.byte_width(); if (ARROW_PREDICT_FALSE(view.size != expected_length)) { @@ -269,15 +222,11 @@ struct ValueConverter> { return view; } } -}; - -template -struct ValueConverter> { - using ValueType = PyBytesView; - static inline Result FromPython(PyObject *obj, bool* is_utf8) { - ValueType view; - RETURN_NOT_OK(view.FromString(obj, is_utf8)); + template> + static inline Result FromPython(const Type& type, PyObject *obj) { //, bool* is_utf8 + PyBytesView view; + RETURN_NOT_OK(view.FromString(obj));//, is_utf8)); return view; } }; @@ -354,10 +303,13 @@ class TypedConverter : public SeqConverter { public: using BuilderType = typename TypeTraits::BuilderType; + explicit TypedConverter(Type type) : data_type_(type) {} + Status Init(ArrayBuilder* builder) override { builder_ = builder; DCHECK_NE(builder_, nullptr); typed_builder_ = checked_cast(builder); + data_type_ = checked_cast(*typed_builder_->type()); return Status::OK(); } @@ -369,7 +321,7 @@ class TypedConverter : public SeqConverter { // This is overridden in several subclasses, but if a Value::FromPython // implementation is defined, it will be used here Status AppendItem(PyObject* obj) { - ARROW_ASSIGN_OR_RAISE(auto value, ValueConverter::FromPython(obj)); + ARROW_ASSIGN_OR_RAISE(auto value, ValueConverter::FromPython(data_type_, obj)); return typed_builder_->Append(value); } @@ -409,6 +361,7 @@ class TypedConverter : public SeqConverter { } protected: + Type data_type_; BuilderType* typed_builder_; }; @@ -425,53 +378,10 @@ class NullConverter }; // ---------------------------------------------------------------------- -// Sequence converter for boolean type - -template -class BoolConverter - : public TypedConverter, null_coding> {}; - -// ---------------------------------------------------------------------- -// Sequence converter template for numeric (integer and floating point) types +// Sequence converter for primitive types template -class NumericConverter - : public TypedConverter, null_coding> {}; - -// ---------------------------------------------------------------------- -// Sequence converters for temporal types - -template -class Date32Converter - : public TypedConverter, null_coding> {}; - -template -class Date64Converter - : public TypedConverter, null_coding> {}; - -template -class Time32Converter : public TypedConverter, null_coding> { - public: - explicit Time32Converter(TimeUnit::type unit) : unit_(unit) {} - Status AppendItem(PyObject* obj) { - ARROW_ASSIGN_OR_RAISE(auto value, ValueConverter::FromPython(obj, unit_)); - return this->typed_builder_->Append(value); - } - private: - TimeUnit::type unit_; -}; - -template -class Time64Converter : public TypedConverter, null_coding> { - public: - explicit Time64Converter(TimeUnit::type unit) : unit_(unit) {} - Status AppendItem(PyObject* obj) { - ARROW_ASSIGN_OR_RAISE(auto value, ValueConverter::FromPython(obj, unit_)); - return this->typed_builder_->Append(value); - } - private: - TimeUnit::type unit_; -}; +class PrimitiveConverter : public TypedConverter, null_coding> {}; template struct PyDateTimeTraits {}; @@ -586,7 +496,7 @@ class TemporalConverter // Sequence converters for Binary, FixedSizeBinary, String template -class BinaryLikeConverter : public TypedConverter { +class BinaryBaseConverter : public TypedConverter { public: using BuilderType = typename TypeTraits::BuilderType; @@ -612,39 +522,26 @@ class BinaryLikeConverter : public TypedConverter { } Status AppendItem(PyObject* obj) { - ARROW_ASSIGN_OR_RAISE(auto value, ValueConverter::FromPython(obj)); + ARROW_ASSIGN_OR_RAISE(auto value, ValueConverter::FromPython(this->data_type_, obj)); return AppendValue(value); } }; -template -class BytesConverter : public BinaryLikeConverter, null_coding> {}; - -template -class LargeBytesConverter : public BinaryLikeConverter, null_coding> {}; - -template -class FixedWidthBytesConverter : public BinaryLikeConverter, null_coding> { - public: - Status AppendItem(PyObject* obj) { - // Accessing members of the templated base requires using this-> here - ARROW_ASSIGN_OR_RAISE(auto value, ValueConverter::FromPython( - obj, checked_cast(*this->typed_builder_->type()) - )); - return this->AppendValue(value); - } -}; +template +class BinaryConverter + : public BinaryBaseConverter, null_coding> {}; // For String/UTF8, if strict_conversions enabled, we reject any non-UTF8, // otherwise we allow but return results as BinaryArray template -class StringConverter : public BinaryLikeConverter, null_coding> { +class StringConverter + : public BinaryBaseConverter, null_coding> { public: StringConverter() : binary_count_(0) {} Status AppendItem(PyObject* obj) { - bool is_utf8; - ARROW_ASSIGN_OR_RAISE(auto value, ValueConverter::FromPython(obj, &is_utf8)); + bool is_utf8 = true; + ARROW_ASSIGN_OR_RAISE(auto value, ValueConverter::FromPython(this->data_type_, obj)); // &is_utf8 if (!is_utf8) { if (STRICT) { return internal::InvalidValue(obj, "was not a utf8 string"); @@ -657,12 +554,10 @@ class StringConverter : public BinaryLikeConverter* out) { RETURN_NOT_OK(SeqConverter::GetResult(out)); - // If we saw any non-unicode, cast results to BinaryArray if (binary_count_) { // We should have bailed out earlier DCHECK(!STRICT); - auto binary_type = TypeTraits::type_singleton(); return (*out)->View(binary_type, out); @@ -1081,38 +976,9 @@ class StructConverter bool strict_conversions_; }; -template -class DecimalConverter - : public TypedConverter, - null_coding> { - public: - using BASE = - TypedConverter, null_coding>; - - Status Init(ArrayBuilder* builder) override { - RETURN_NOT_OK(BASE::Init(builder)); - decimal_type_ = checked_pointer_cast(this->typed_builder_->type()); - return Status::OK(); - } - - Status AppendItem(PyObject* obj) { - Decimal128 value; - RETURN_NOT_OK(internal::DecimalFromPyObject(obj, *decimal_type_, &value)); - return this->typed_builder_->Append(value); - } - - private: - std::shared_ptr decimal_type_; -}; - -#define NUMERIC_CONVERTER_CASE(TYPE_ENUM, TYPE) \ - case Type::TYPE_ENUM: \ - *out = std::unique_ptr(new NumericConverter); \ - break; - -#define SIMPLE_CONVERTER_CASE(TYPE_ENUM, TYPE_CLASS) \ - case Type::TYPE_ENUM: \ - *out = std::unique_ptr(new TYPE_CLASS); \ +#define CONVERTER(TYPE_ENUM, CLASS, TYPE) \ + case Type::TYPE_ENUM: \ + *out = std::unique_ptr(new CLASS(type)); \ break; // Dynamic constructor for sequence converters @@ -1120,63 +986,61 @@ template Status GetConverterFlat(const std::shared_ptr& type, bool strict_conversions, std::unique_ptr* out) { switch (type->id()) { - SIMPLE_CONVERTER_CASE(NA, NullConverter); - SIMPLE_CONVERTER_CASE(BOOL, BoolConverter); - NUMERIC_CONVERTER_CASE(INT8, Int8Type); - NUMERIC_CONVERTER_CASE(INT16, Int16Type); - NUMERIC_CONVERTER_CASE(INT32, Int32Type); - NUMERIC_CONVERTER_CASE(INT64, Int64Type); - NUMERIC_CONVERTER_CASE(UINT8, UInt8Type); - NUMERIC_CONVERTER_CASE(UINT16, UInt16Type); - NUMERIC_CONVERTER_CASE(UINT32, UInt32Type); - NUMERIC_CONVERTER_CASE(UINT64, UInt64Type); - NUMERIC_CONVERTER_CASE(HALF_FLOAT, HalfFloatType); - NUMERIC_CONVERTER_CASE(FLOAT, FloatType); - NUMERIC_CONVERTER_CASE(DOUBLE, DoubleType); - SIMPLE_CONVERTER_CASE(DECIMAL, DecimalConverter); - SIMPLE_CONVERTER_CASE(BINARY, BytesConverter); - SIMPLE_CONVERTER_CASE(LARGE_BINARY, LargeBytesConverter); - SIMPLE_CONVERTER_CASE(FIXED_SIZE_BINARY, FixedWidthBytesConverter); - SIMPLE_CONVERTER_CASE(DATE32, Date32Converter); - SIMPLE_CONVERTER_CASE(DATE64, Date64Converter); + case Type::NA: + *out = std::unique_ptr(new NullConverter()); + break; + CONVERTER(BOOL, PrimitiveConverter, BooleanType); + CONVERTER(INT8, PrimitiveConverter, Int8Type); + CONVERTER(INT16, PrimitiveConverter, Int16Type); + CONVERTER(INT32, PrimitiveConverter, Int32Type); + CONVERTER(INT64, PrimitiveConverter, Int64Type); + CONVERTER(UINT8, PrimitiveConverter, UInt8Type); + CONVERTER(UINT16, PrimitiveConverter, UInt16Type); + CONVERTER(UINT32, PrimitiveConverter, UInt32Type); + CONVERTER(UINT64, PrimitiveConverter, UInt64Type); + CONVERTER(HALF_FLOAT, PrimitiveConverter, HalfFloatType); + CONVERTER(FLOAT, PrimitiveConverter, FloatType); + CONVERTER(DOUBLE, PrimitiveConverter, DoubleType); + CONVERTER(DECIMAL, PrimitiveConverter, Decimal128Type); + CONVERTER(DATE32, PrimitiveConverter, Date32Type); + CONVERTER(DATE64, PrimitiveConverter, Date64Type); + CONVERTER(TIME32, PrimitiveConverter, Time32Type); + CONVERTER(TIME64, PrimitiveConverter, Time64Type); + CONVERTER(BINARY, BinaryConverter, BinaryType); + CONVERTER(LARGE_BINARY, BinaryConverter, LargeBinaryType); + CONVERTER(FIXED_SIZE_BINARY, BinaryConverter, FixedSizeBinaryType); case Type::STRING: if (strict_conversions) { *out = std::unique_ptr( - new StringConverter()); + new StringConverter(type) + ); } else { *out = std::unique_ptr( - new StringConverter()); + new StringConverter(type) + ); } break; case Type::LARGE_STRING: if (strict_conversions) { *out = std::unique_ptr( - new StringConverter()); + new StringConverter(type) + ); } else { *out = std::unique_ptr( - new StringConverter()); + new StringConverter(type) + ); } break; - case Type::TIME32: { - *out = std::unique_ptr(new Time32Converter( - checked_cast(*type).unit())); - break; - } - case Type::TIME64: { - *out = std::unique_ptr(new Time64Converter( - checked_cast(*type).unit())); - break; - } case Type::TIMESTAMP: { - *out = - std::unique_ptr(new TemporalConverter( - checked_cast(*type).unit())); + *out = std::unique_ptr( + new TemporalConverter(type) + ); break; } case Type::DURATION: { - *out = - std::unique_ptr(new TemporalConverter( - checked_cast(*type).unit())); + *out = std::unique_ptr( + new TemporalConverter(type) + ); break; } default: