From 709639eec459d430cfa2b4922b4b243a719d4efd Mon Sep 17 00:00:00 2001 From: zhixingheyi-tian Date: Fri, 16 Sep 2022 13:19:43 +0800 Subject: [PATCH 1/2] troubleshoot for write parquet --- cpp/src/arrow/array/validate.cc | 3 +++ cpp/src/parquet/arrow/reader.cc | 6 +++--- cpp/src/parquet/column_reader.cc | 5 +++-- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 6ac885f8443c6..b6aa2fe69cfa0 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -18,6 +18,7 @@ #include "arrow/array/validate.h" #include +#include #include "arrow/array.h" // IWYU pragma: keep #include "arrow/extension_type.h" @@ -205,6 +206,8 @@ struct ValidateArrayImpl { } const auto data_extent = last_offset - first_offset; const auto values_length = values.size(); + std::cout << "data_extent:" << data_extent << std::endl; + std::cout << "values_length:" << values_length << std::endl; if (values_length < data_extent) { return Status::Invalid("Length spanned by binary offsets (", data_extent, ") larger than values array (size ", values_length, ")"); diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 55208e503e723..38c9b18d55e05 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -106,9 +106,9 @@ class ColumnReaderImpl : public ColumnReader { std::shared_ptr<::arrow::ChunkedArray>* out) final { RETURN_NOT_OK(LoadBatch(batch_size)); RETURN_NOT_OK(BuildArray(batch_size, out)); - for (int x = 0; x < (*out)->num_chunks(); x++) { - RETURN_NOT_OK((*out)->chunk(x)->Validate()); - } + // for (int x = 0; x < (*out)->num_chunks(); x++) { + // RETURN_NOT_OK((*out)->chunk(x)->Validate()); + // } return Status::OK(); } diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index f4979d2d97f57..4b7e5eb67b39a 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -1626,6 +1626,7 @@ class ByteArrayChunkedRecordReader : public TypedRecordReader, std::shared_ptr ReleaseValues() override { auto result = values_; + std::cout << "values_->size():" << values_->size() << std::endl; // PARQUET_THROW_NOT_OK(result->Resize(bytes_for_values(values_written_), true)); values_ = AllocateBuffer(this->pool_); values_capacity_ = 0; @@ -1639,9 +1640,9 @@ class ByteArrayChunkedRecordReader : public TypedRecordReader, const auto first_offset = offsetArr[0]; const auto last_offset = offsetArr[values_written_]; int64_t binary_length = last_offset - first_offset; - // std::cout << "binary_length:" << binary_length << std::endl; + std::cout << "binary_length:" << binary_length << std::endl; values_->SetSize(binary_length); - + offset_ = AllocateBuffer(this->pool_); bianry_length_ = 0; return result; From 53eed0c9519691ba3b9d6bda2604a7c6eb70ceb9 Mon Sep 17 00:00:00 2001 From: zhixingheyi-tian Date: Fri, 16 Sep 2022 15:50:55 +0800 Subject: [PATCH 2/2] Use capacity instead of size in Validate() --- cpp/src/arrow/array/validate.cc | 5 +---- cpp/src/parquet/arrow/reader.cc | 6 +++--- cpp/src/parquet/column_reader.cc | 3 +-- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index b6aa2fe69cfa0..cfbabfaeef0d5 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -18,7 +18,6 @@ #include "arrow/array/validate.h" #include -#include #include "arrow/array.h" // IWYU pragma: keep #include "arrow/extension_type.h" @@ -205,9 +204,7 @@ struct ValidateArrayImpl { return Status::Invalid("Negative offsets in binary array"); } const auto data_extent = last_offset - first_offset; - const auto values_length = values.size(); - std::cout << "data_extent:" << data_extent << std::endl; - std::cout << "values_length:" << values_length << std::endl; + const auto values_length = values.capacity(); if (values_length < data_extent) { return Status::Invalid("Length spanned by binary offsets (", data_extent, ") larger than values array (size ", values_length, ")"); diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 38c9b18d55e05..55208e503e723 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -106,9 +106,9 @@ class ColumnReaderImpl : public ColumnReader { std::shared_ptr<::arrow::ChunkedArray>* out) final { RETURN_NOT_OK(LoadBatch(batch_size)); RETURN_NOT_OK(BuildArray(batch_size, out)); - // for (int x = 0; x < (*out)->num_chunks(); x++) { - // RETURN_NOT_OK((*out)->chunk(x)->Validate()); - // } + for (int x = 0; x < (*out)->num_chunks(); x++) { + RETURN_NOT_OK((*out)->chunk(x)->Validate()); + } return Status::OK(); } diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 4b7e5eb67b39a..56f87b74569fd 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -1626,7 +1626,6 @@ class ByteArrayChunkedRecordReader : public TypedRecordReader, std::shared_ptr ReleaseValues() override { auto result = values_; - std::cout << "values_->size():" << values_->size() << std::endl; // PARQUET_THROW_NOT_OK(result->Resize(bytes_for_values(values_written_), true)); values_ = AllocateBuffer(this->pool_); values_capacity_ = 0; @@ -1640,7 +1639,7 @@ class ByteArrayChunkedRecordReader : public TypedRecordReader, const auto first_offset = offsetArr[0]; const auto last_offset = offsetArr[values_written_]; int64_t binary_length = last_offset - first_offset; - std::cout << "binary_length:" << binary_length << std::endl; + // std::cout << "binary_length:" << binary_length << std::endl; values_->SetSize(binary_length); offset_ = AllocateBuffer(this->pool_);