diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index 1f8b6cc4882cf..ca391b4354c07 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -366,8 +366,12 @@ std::optional ParquetFileFragment::EvaluateStatisticsAsExpr const parquet::Statistics& statistics) { auto field_expr = compute::field_ref(field_ref); + bool may_have_null = !statistics.HasNullCount() || statistics.null_count() > 0; // Optimize for corner case where all values are nulls - if (statistics.num_values() == 0 && statistics.null_count() > 0) { + if (statistics.num_values() == 0) { + // If there are no non-null values, column `field_ref` in the fragment + // might be empty or all values are nulls. In this case, we also return + // a null expression. return is_null(std::move(field_expr)); } @@ -378,7 +382,6 @@ std::optional ParquetFileFragment::EvaluateStatisticsAsExpr auto maybe_min = Cast(min, field.type()); auto maybe_max = Cast(max, field.type()); - if (maybe_min.ok() && maybe_max.ok()) { min = maybe_min.MoveValueUnsafe().scalar(); max = maybe_max.MoveValueUnsafe().scalar(); @@ -386,7 +389,7 @@ std::optional ParquetFileFragment::EvaluateStatisticsAsExpr if (min->Equals(*max)) { auto single_value = compute::equal(field_expr, compute::literal(std::move(min))); - if (statistics.null_count() == 0) { + if (!may_have_null) { return single_value; } return compute::or_(std::move(single_value), is_null(std::move(field_expr))); @@ -412,9 +415,8 @@ std::optional ParquetFileFragment::EvaluateStatisticsAsExpr } else { in_range = compute::and_(std::move(lower_bound), std::move(upper_bound)); } - - if (statistics.null_count() != 0) { - return compute::or_(std::move(in_range), compute::is_null(field_expr)); + if (may_have_null) { + return compute::or_(std::move(in_range), compute::is_null(std::move(field_expr))); } return in_range; } @@ -423,7 +425,7 @@ std::optional ParquetFileFragment::EvaluateStatisticsAsExpr std::optional ParquetFileFragment::EvaluateStatisticsAsExpression( const Field& field, const parquet::Statistics& statistics) { - const auto field_name = field.name(); + auto field_name = field.name(); return EvaluateStatisticsAsExpression(field, FieldRef(std::move(field_name)), statistics); } diff --git a/cpp/src/arrow/dataset/file_parquet_test.cc b/cpp/src/arrow/dataset/file_parquet_test.cc index bf626826d4d1b..2c05dcd9be459 100644 --- a/cpp/src/arrow/dataset/file_parquet_test.cc +++ b/cpp/src/arrow/dataset/file_parquet_test.cc @@ -841,6 +841,56 @@ TEST(TestParquetStatistics, NullMax) { EXPECT_EQ(stat_expression->ToString(), "(x >= 1)"); } +TEST(TestParquetStatistics, NoNullCount) { + auto field = ::arrow::field("x", int32()); + auto parquet_node_ptr = ::parquet::schema::Int32("x", ::parquet::Repetition::REQUIRED); + ::parquet::ColumnDescriptor descr(parquet_node_ptr, /*max_definition_level=*/1, + /*max_repetition_level=*/0); + + auto int32_to_parquet_stats = [](int32_t v) { + std::string value; + value.resize(sizeof(int32_t)); + memcpy(value.data(), &v, sizeof(int32_t)); + return value; + }; + { + // Base case: when null_count is not set, the expression might contain null + ::parquet::EncodedStatistics encoded_stats; + encoded_stats.set_min(int32_to_parquet_stats(1)); + encoded_stats.set_max(int32_to_parquet_stats(100)); + encoded_stats.has_null_count = false; + encoded_stats.all_null_value = false; + encoded_stats.null_count = 0; + auto stats = ::parquet::Statistics::Make(&descr, &encoded_stats, /*num_values=*/10); + + auto stat_expression = + ParquetFileFragment::EvaluateStatisticsAsExpression(*field, *stats); + ASSERT_TRUE(stat_expression.has_value()); + EXPECT_EQ(stat_expression->ToString(), + "(((x >= 1) and (x <= 100)) or is_null(x, {nan_is_null=false}))"); + } + { + // Special case: when num_value is 0, it would return + // "is_null". + ::parquet::EncodedStatistics encoded_stats; + encoded_stats.has_null_count = true; + encoded_stats.null_count = 1; + encoded_stats.all_null_value = true; + auto stats = ::parquet::Statistics::Make(&descr, &encoded_stats, /*num_values=*/0); + auto stat_expression = + ParquetFileFragment::EvaluateStatisticsAsExpression(*field, *stats); + ASSERT_TRUE(stat_expression.has_value()); + EXPECT_EQ(stat_expression->ToString(), "is_null(x, {nan_is_null=false})"); + + encoded_stats.has_null_count = false; + encoded_stats.all_null_value = false; + stats = ::parquet::Statistics::Make(&descr, &encoded_stats, /*num_values=*/0); + stat_expression = ParquetFileFragment::EvaluateStatisticsAsExpression(*field, *stats); + ASSERT_TRUE(stat_expression.has_value()); + EXPECT_EQ(stat_expression->ToString(), "is_null(x, {nan_is_null=false})"); + } +} + class DelayedBufferReader : public ::arrow::io::BufferReader { public: explicit DelayedBufferReader(const std::shared_ptr<::arrow::Buffer>& buffer)