forked from apache/arrow
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
apacheGH-43983: [C++][Parquet] Add support for arrow::ArrayStatistics…
…: zero-copy types (apache#43984) ### Rationale for this change Statistics is useful for fast processing. Target types: * `Int32` * `Int64` * `Float` * `Double` * `Timestamp[milli]` * `Timestamp[micro]` * `Timestamp[nano]` ### What changes are included in this PR? Map `ColumnChunkMetaData` information to `arrow::ArrayStatistics`. ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. * GitHub Issue: apache#43983 Authored-by: Sutou Kouhei <[email protected]> Signed-off-by: Sutou Kouhei <[email protected]>
- Loading branch information
Showing 2 changed files with 117 additions and 45 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -319,30 +319,20 @@ void ReconstructChunksWithoutNulls(::arrow::ArrayVector* chunks) { | |
} | ||
|
||
template <typename ArrowType, typename ParquetType> | ||
Status TransferInt(RecordReader* reader, | ||
std::unique_ptr<::parquet::ColumnChunkMetaData> metadata, | ||
const ReaderContext* ctx, const std::shared_ptr<Field>& field, | ||
Datum* out) { | ||
void AttachStatistics(::arrow::ArrayData* data, | ||
std::unique_ptr<::parquet::ColumnChunkMetaData> metadata, | ||
const ReaderContext* ctx) { | ||
using ArrowCType = typename ArrowType::c_type; | ||
using ParquetCType = typename ParquetType::c_type; | ||
int64_t length = reader->values_written(); | ||
ARROW_ASSIGN_OR_RAISE(auto data, | ||
::arrow::AllocateBuffer(length * sizeof(ArrowCType), ctx->pool)); | ||
|
||
auto values = reinterpret_cast<const ParquetCType*>(reader->values()); | ||
auto out_ptr = reinterpret_cast<ArrowCType*>(data->mutable_data()); | ||
std::copy(values, values + length, out_ptr); | ||
int64_t null_count = 0; | ||
std::vector<std::shared_ptr<Buffer>> buffers = {nullptr, std::move(data)}; | ||
if (field->nullable()) { | ||
null_count = reader->null_count(); | ||
buffers[0] = reader->ReleaseIsValid(); | ||
auto statistics = metadata->statistics().get(); | ||
if (data->null_count == ::arrow::kUnknownNullCount && !statistics) { | ||
return; | ||
} | ||
auto array_data = | ||
::arrow::ArrayData::Make(field->type(), length, std::move(buffers), null_count); | ||
|
||
auto array_statistics = std::make_shared<::arrow::ArrayStatistics>(); | ||
array_statistics->null_count = null_count; | ||
auto statistics = metadata->statistics().get(); | ||
if (data->null_count != ::arrow::kUnknownNullCount) { | ||
array_statistics->null_count = data->null_count; | ||
} | ||
if (statistics) { | ||
if (statistics->HasDistinctCount()) { | ||
array_statistics->distinct_count = statistics->distinct_count(); | ||
|
@@ -352,31 +342,63 @@ Status TransferInt(RecordReader* reader, | |
static_cast<::parquet::TypedStatistics<ParquetType>*>(statistics); | ||
const ArrowCType min = typed_statistics->min(); | ||
const ArrowCType max = typed_statistics->max(); | ||
if (std::is_signed<ArrowCType>::value) { | ||
if (std::is_floating_point<ArrowCType>::value) { | ||
array_statistics->min = static_cast<double>(min); | ||
array_statistics->max = static_cast<double>(max); | ||
} else if (std::is_signed<ArrowCType>::value) { | ||
array_statistics->min = static_cast<int64_t>(min); | ||
array_statistics->max = static_cast<int64_t>(max); | ||
} else { | ||
array_statistics->min = static_cast<uint64_t>(min); | ||
array_statistics->max = static_cast<uint64_t>(max); | ||
} | ||
// We can assume that integer based min/max are always exact if | ||
// they exist. Apache Parquet's "Statistics" has | ||
// "is_min_value_exact" and "is_max_value_exact" but we can | ||
// ignore them for integer based min/max. | ||
// We can assume that integer and floating point number based | ||
// min/max are always exact if they exist. Apache Parquet's | ||
// "Statistics" has "is_min_value_exact" and | ||
// "is_max_value_exact" but we can ignore them for integer and | ||
// floating point number based min/max. | ||
// | ||
// See also the discussion at [email protected]: | ||
// https://lists.apache.org/thread/zfnmg5p51b7oylft5w5k4670wgkd4zv4 | ||
array_statistics->is_min_exact = true; | ||
array_statistics->is_max_exact = true; | ||
} | ||
} | ||
array_data->statistics = std::move(array_statistics); | ||
|
||
data->statistics = std::move(array_statistics); | ||
} | ||
|
||
template <typename ArrowType, typename ParquetType> | ||
Status TransferInt(RecordReader* reader, | ||
std::unique_ptr<::parquet::ColumnChunkMetaData> metadata, | ||
const ReaderContext* ctx, const std::shared_ptr<Field>& field, | ||
Datum* out) { | ||
using ArrowCType = typename ArrowType::c_type; | ||
using ParquetCType = typename ParquetType::c_type; | ||
int64_t length = reader->values_written(); | ||
ARROW_ASSIGN_OR_RAISE(auto data, | ||
::arrow::AllocateBuffer(length * sizeof(ArrowCType), ctx->pool)); | ||
|
||
auto values = reinterpret_cast<const ParquetCType*>(reader->values()); | ||
auto out_ptr = reinterpret_cast<ArrowCType*>(data->mutable_data()); | ||
std::copy(values, values + length, out_ptr); | ||
int64_t null_count = 0; | ||
std::vector<std::shared_ptr<Buffer>> buffers = {nullptr, std::move(data)}; | ||
if (field->nullable()) { | ||
null_count = reader->null_count(); | ||
buffers[0] = reader->ReleaseIsValid(); | ||
} | ||
auto array_data = | ||
::arrow::ArrayData::Make(field->type(), length, std::move(buffers), null_count); | ||
AttachStatistics<ArrowType, ParquetType>(array_data.get(), std::move(metadata), ctx); | ||
*out = std::make_shared<ArrayType<ArrowType>>(std::move(array_data)); | ||
return Status::OK(); | ||
} | ||
|
||
std::shared_ptr<Array> TransferZeroCopy(RecordReader* reader, | ||
const std::shared_ptr<Field>& field) { | ||
template <typename ArrowType, typename ParquetType> | ||
std::shared_ptr<Array> TransferZeroCopy( | ||
RecordReader* reader, std::unique_ptr<::parquet::ColumnChunkMetaData> metadata, | ||
const ReaderContext* ctx, const std::shared_ptr<Field>& field) { | ||
std::shared_ptr<::arrow::ArrayData> data; | ||
if (field->nullable()) { | ||
std::vector<std::shared_ptr<Buffer>> buffers = {reader->ReleaseIsValid(), | ||
|
@@ -388,7 +410,8 @@ std::shared_ptr<Array> TransferZeroCopy(RecordReader* reader, | |
data = std::make_shared<::arrow::ArrayData>(field->type(), reader->values_written(), | ||
std::move(buffers), /*null_count=*/0); | ||
} | ||
return ::arrow::MakeArray(data); | ||
AttachStatistics<ArrowType, ParquetType>(data.get(), std::move(metadata), ctx); | ||
return ::arrow::MakeArray(std::move(data)); | ||
} | ||
|
||
Status TransferBool(RecordReader* reader, bool nullable, MemoryPool* pool, Datum* out) { | ||
|
@@ -794,10 +817,20 @@ Status TransferColumnData(RecordReader* reader, | |
break; | ||
} | ||
case ::arrow::Type::INT32: | ||
result = TransferZeroCopy<::arrow::Int32Type, Int32Type>( | ||
reader, std::move(metadata), ctx, value_field); | ||
break; | ||
case ::arrow::Type::INT64: | ||
result = TransferZeroCopy<::arrow::Int64Type, Int64Type>( | ||
reader, std::move(metadata), ctx, value_field); | ||
break; | ||
case ::arrow::Type::FLOAT: | ||
result = TransferZeroCopy<::arrow::FloatType, FloatType>( | ||
reader, std::move(metadata), ctx, value_field); | ||
break; | ||
case ::arrow::Type::DOUBLE: | ||
result = TransferZeroCopy(reader, value_field); | ||
result = TransferZeroCopy<::arrow::DoubleType, DoubleType>( | ||
reader, std::move(metadata), ctx, value_field); | ||
break; | ||
case ::arrow::Type::BOOL: | ||
RETURN_NOT_OK(TransferBool(reader, value_field->nullable(), pool, &result)); | ||
|
@@ -895,7 +928,8 @@ Status TransferColumnData(RecordReader* reader, | |
case ::arrow::TimeUnit::MILLI: | ||
case ::arrow::TimeUnit::MICRO: | ||
case ::arrow::TimeUnit::NANO: | ||
result = TransferZeroCopy(reader, value_field); | ||
result = TransferZeroCopy<::arrow::Int64Type, Int64Type>( | ||
reader, std::move(metadata), ctx, value_field); | ||
break; | ||
default: | ||
return Status::NotImplemented("TimeUnit not supported"); | ||
|