Skip to content

Commit

Permalink
apacheGH-43712: [C++][Parquet] Dataset: Handle num-nulls in Parquet c…
Browse files Browse the repository at this point in the history
…orrectly when !HasNullCount() (apache#43726)

### Rationale for this change

See issue. When `!HasNullCount`, we cannot gurantee null exists

### What changes are included in this PR?

Handle HasNullCount in dataset expr

### Are these changes tested?

Yes

### Are there any user-facing changes?

Merely

* GitHub Issue: apache#43712

Lead-authored-by: mwish <[email protected]>
Co-authored-by: mwish <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: mwish <[email protected]>
  • Loading branch information
3 people authored Sep 6, 2024
1 parent 089ad78 commit ab0a40e
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 7 deletions.
16 changes: 9 additions & 7 deletions cpp/src/arrow/dataset/file_parquet.cc
Original file line number Diff line number Diff line change
Expand Up @@ -366,8 +366,12 @@ std::optional<compute::Expression> ParquetFileFragment::EvaluateStatisticsAsExpr
const parquet::Statistics& statistics) {
auto field_expr = compute::field_ref(field_ref);

bool may_have_null = !statistics.HasNullCount() || statistics.null_count() > 0;
// Optimize for corner case where all values are nulls
if (statistics.num_values() == 0 && statistics.null_count() > 0) {
if (statistics.num_values() == 0) {
// If there are no non-null values, column `field_ref` in the fragment
// might be empty or all values are nulls. In this case, we also return
// a null expression.
return is_null(std::move(field_expr));
}

Expand All @@ -378,15 +382,14 @@ std::optional<compute::Expression> ParquetFileFragment::EvaluateStatisticsAsExpr

auto maybe_min = Cast(min, field.type());
auto maybe_max = Cast(max, field.type());

if (maybe_min.ok() && maybe_max.ok()) {
min = maybe_min.MoveValueUnsafe().scalar();
max = maybe_max.MoveValueUnsafe().scalar();

if (min->Equals(*max)) {
auto single_value = compute::equal(field_expr, compute::literal(std::move(min)));

if (statistics.null_count() == 0) {
if (!may_have_null) {
return single_value;
}
return compute::or_(std::move(single_value), is_null(std::move(field_expr)));
Expand All @@ -412,9 +415,8 @@ std::optional<compute::Expression> ParquetFileFragment::EvaluateStatisticsAsExpr
} else {
in_range = compute::and_(std::move(lower_bound), std::move(upper_bound));
}

if (statistics.null_count() != 0) {
return compute::or_(std::move(in_range), compute::is_null(field_expr));
if (may_have_null) {
return compute::or_(std::move(in_range), compute::is_null(std::move(field_expr)));
}
return in_range;
}
Expand All @@ -423,7 +425,7 @@ std::optional<compute::Expression> ParquetFileFragment::EvaluateStatisticsAsExpr

std::optional<compute::Expression> ParquetFileFragment::EvaluateStatisticsAsExpression(
const Field& field, const parquet::Statistics& statistics) {
const auto field_name = field.name();
auto field_name = field.name();
return EvaluateStatisticsAsExpression(field, FieldRef(std::move(field_name)),
statistics);
}
Expand Down
50 changes: 50 additions & 0 deletions cpp/src/arrow/dataset/file_parquet_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -841,6 +841,56 @@ TEST(TestParquetStatistics, NullMax) {
EXPECT_EQ(stat_expression->ToString(), "(x >= 1)");
}

TEST(TestParquetStatistics, NoNullCount) {
auto field = ::arrow::field("x", int32());
auto parquet_node_ptr = ::parquet::schema::Int32("x", ::parquet::Repetition::REQUIRED);
::parquet::ColumnDescriptor descr(parquet_node_ptr, /*max_definition_level=*/1,
/*max_repetition_level=*/0);

auto int32_to_parquet_stats = [](int32_t v) {
std::string value;
value.resize(sizeof(int32_t));
memcpy(value.data(), &v, sizeof(int32_t));
return value;
};
{
// Base case: when null_count is not set, the expression might contain null
::parquet::EncodedStatistics encoded_stats;
encoded_stats.set_min(int32_to_parquet_stats(1));
encoded_stats.set_max(int32_to_parquet_stats(100));
encoded_stats.has_null_count = false;
encoded_stats.all_null_value = false;
encoded_stats.null_count = 0;
auto stats = ::parquet::Statistics::Make(&descr, &encoded_stats, /*num_values=*/10);

auto stat_expression =
ParquetFileFragment::EvaluateStatisticsAsExpression(*field, *stats);
ASSERT_TRUE(stat_expression.has_value());
EXPECT_EQ(stat_expression->ToString(),
"(((x >= 1) and (x <= 100)) or is_null(x, {nan_is_null=false}))");
}
{
// Special case: when num_value is 0, it would return
// "is_null".
::parquet::EncodedStatistics encoded_stats;
encoded_stats.has_null_count = true;
encoded_stats.null_count = 1;
encoded_stats.all_null_value = true;
auto stats = ::parquet::Statistics::Make(&descr, &encoded_stats, /*num_values=*/0);
auto stat_expression =
ParquetFileFragment::EvaluateStatisticsAsExpression(*field, *stats);
ASSERT_TRUE(stat_expression.has_value());
EXPECT_EQ(stat_expression->ToString(), "is_null(x, {nan_is_null=false})");

encoded_stats.has_null_count = false;
encoded_stats.all_null_value = false;
stats = ::parquet::Statistics::Make(&descr, &encoded_stats, /*num_values=*/0);
stat_expression = ParquetFileFragment::EvaluateStatisticsAsExpression(*field, *stats);
ASSERT_TRUE(stat_expression.has_value());
EXPECT_EQ(stat_expression->ToString(), "is_null(x, {nan_is_null=false})");
}
}

class DelayedBufferReader : public ::arrow::io::BufferReader {
public:
explicit DelayedBufferReader(const std::shared_ptr<::arrow::Buffer>& buffer)
Expand Down

0 comments on commit ab0a40e

Please sign in to comment.