diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc
index c30441d911e4e..e400e09656929 100644
--- a/cpp/src/arrow/dataset/file_parquet.cc
+++ b/cpp/src/arrow/dataset/file_parquet.cc
@@ -112,7 +112,7 @@ bool IsNan(const Scalar& value) {
 }
 
 std::optional<compute::Expression> ColumnChunkStatisticsAsExpression(
-    const SchemaField& schema_field, const parquet::RowGroupMetaData& metadata) {
+    const SchemaField& schema_field, const parquet::RowGroupMetaData& metadata, const arrow::Field& dest_field) {
   // For the remaining of this function, failure to extract/parse statistics
   // are ignored by returning nullptr. The goal is two fold. First
   // avoid an optimization which breaks the computation. Second, allow the
@@ -131,7 +131,7 @@ std::optional<compute::Expression> ColumnChunkStatisticsAsExpression(
     return std::nullopt;
   }
 
-  return ParquetFileFragment::EvaluateStatisticsAsExpression(*field, *statistics);
+  return ParquetFileFragment::EvaluateStatisticsAsExpression(*field, *statistics, dest_field);
 }
 
 void AddColumnIndices(const SchemaField& schema_field,
@@ -311,7 +311,7 @@ Result<bool> IsSupportedParquetFile(const ParquetFileFormat& format,
 }  // namespace
 
 std::optional<compute::Expression> ParquetFileFragment::EvaluateStatisticsAsExpression(
-    const Field& field, const parquet::Statistics& statistics) {
+    const Field& field, const parquet::Statistics& statistics, const Field& dest_field) {
   auto field_expr = compute::field_ref(field.name());
 
   // Optimize for corner case where all values are nulls
@@ -324,8 +324,8 @@ std::optional<compute::Expression> ParquetFileFragment::EvaluateStatisticsAsExpr
     return std::nullopt;
   }
 
-  auto maybe_min = min->CastTo(field.type());
-  auto maybe_max = max->CastTo(field.type());
+  auto maybe_min = min->CastTo(dest_field.type());
+  auto maybe_max = max->CastTo(dest_field.type());
 
   if (maybe_min.ok() && maybe_max.ok()) {
     min = maybe_min.MoveValueUnsafe();
@@ -799,12 +799,13 @@ Result<std::vector<compute::Expression>> ParquetFileFragment::TestRowGroups(
       statistics_expressions_complete_[match[0]] = true;
 
       const SchemaField& schema_field = manifest_->schema_fields[match[0]];
+      auto arrow_field = physical_schema_->field(match[0]);
       int i = 0;
       for (int row_group : *row_groups_) {
         auto row_group_metadata = metadata_->RowGroup(row_group);
         if (auto minmax =
-                ColumnChunkStatisticsAsExpression(schema_field, *row_group_metadata)) {
+                ColumnChunkStatisticsAsExpression(schema_field, *row_group_metadata, *arrow_field)) {
           FoldingAnd(&statistics_expressions_[i], std::move(*minmax));
           ARROW_ASSIGN_OR_RAISE(statistics_expressions_[i],
                                 statistics_expressions_[i].Bind(*physical_schema_));
diff --git a/cpp/src/arrow/dataset/file_parquet.h b/cpp/src/arrow/dataset/file_parquet.h
index f33190bd93347..8cf00c5c890cb 100644
--- a/cpp/src/arrow/dataset/file_parquet.h
+++ b/cpp/src/arrow/dataset/file_parquet.h
@@ -172,7 +172,7 @@ class ARROW_DS_EXPORT ParquetFileFragment : public FileFragment {
   Result<std::shared_ptr<ParquetFileFragment>> Subset(std::vector<int> row_group_ids);
 
   static std::optional<compute::Expression> EvaluateStatisticsAsExpression(
-      const Field& field, const parquet::Statistics& statistics);
+      const Field& field, const parquet::Statistics& statistics, const Field& dest_field);
 
  private:
   ParquetFileFragment(FileSource source, std::shared_ptr<ParquetFileFormat> format,
diff --git a/cpp/src/arrow/dataset/file_parquet_test.cc b/cpp/src/arrow/dataset/file_parquet_test.cc
index 42f923f0e6a27..dfe4f291714fe 100644
--- a/cpp/src/arrow/dataset/file_parquet_test.cc
+++ b/cpp/src/arrow/dataset/file_parquet_test.cc
@@ -726,7 +726,7 @@ TEST(TestParquetStatistics, NullMax) {
       parquet::ParquetFileReader::OpenFile(dir_string + "/nan_in_stats.parquet");
   auto statistics = reader->RowGroup(0)->metadata()->ColumnChunk(0)->statistics();
   auto stat_expression =
-      ParquetFileFragment::EvaluateStatisticsAsExpression(*field, *statistics);
+      ParquetFileFragment::EvaluateStatisticsAsExpression(*field, *statistics, *field);
   EXPECT_EQ(stat_expression->ToString(), "(x >= 1)");
 }
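
For reference, the net effect of this change is that `EvaluateStatisticsAsExpression` now casts the row-group min/max statistics to a caller-supplied destination field's type, rather than to the type of the field the statistics were read from. Below is a minimal sketch of the new three-argument call, modeled on the updated test; the `large_utf8` destination type and the helper name are hypothetical, and the `statistics` object is assumed to be obtained as in `TestParquetStatistics`:

```cpp
#include <optional>

#include "arrow/api.h"
#include "arrow/dataset/file_parquet.h"
#include "parquet/statistics.h"

// Hypothetical helper: derive a min/max guarantee for a column whose
// dataset-side type differs from the physical Parquet type.
std::optional<arrow::compute::Expression> GuaranteeForRetypedColumn(
    const parquet::Statistics& statistics) {
  auto physical_field = arrow::field("x", arrow::utf8());    // type in the file
  auto dest_field = arrow::field("x", arrow::large_utf8());  // type in the dataset schema
  // min/max are cast to dest_field's type, so the resulting expression can be
  // bound against the dataset schema without a type mismatch.
  return arrow::dataset::ParquetFileFragment::EvaluateStatisticsAsExpression(
      *physical_field, statistics, *dest_field);
}
```

Passing the same field for both arguments, as the updated test does, preserves the previous behavior.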