Skip to content

Commit

Permalink
add test
Browse files Browse the repository at this point in the history
  • Loading branch information
mapleFU committed Aug 22, 2024
1 parent f7929f3 commit 132363c
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 3 deletions.
12 changes: 9 additions & 3 deletions cpp/src/arrow/dataset/file_parquet.cc
Original file line number Diff line number Diff line change
Expand Up @@ -366,10 +366,16 @@ std::optional<compute::Expression> ParquetFileFragment::EvaluateStatisticsAsExpr
const parquet::Statistics& statistics) {
auto field_expr = compute::field_ref(field_ref);

bool may_has_null = !statistics.HasNullCount() || statistics.null_count() != 0;
bool may_has_null = !statistics.HasNullCount() || statistics.null_count() > 0;
bool must_has_null = statistics.HasNullCount() && statistics.null_count() > 0;
// Optimize for corner case where all values are nulls
if (statistics.num_values() == 0 && may_has_null) {
return is_null(std::move(field_expr));
if (statistics.num_values() == 0) {
if (must_has_null) {
return is_null(std::move(field_expr));
}
// If there are no values and the null count is zero or unknown, the column
// may be empty or contain only nulls, so nothing can be guaranteed.
return std::nullopt;
}

std::shared_ptr<Scalar> min, max;
Expand Down
47 changes: 47 additions & 0 deletions cpp/src/arrow/dataset/file_parquet_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -841,6 +841,53 @@ TEST(TestParquetStatistics, NullMax) {
EXPECT_EQ(stat_expression->ToString(), "(x >= 1)");
}

// Checks the expression derived from Parquet statistics when the null count is
// not set, and the corner cases where the statistics report zero values.
TEST(TestParquetStatistics, NoNullCount) {
  auto field = ::arrow::field("x", int32());
  auto parquet_node_ptr = ::parquet::schema::Int32("x", ::parquet::Repetition::REQUIRED);
  ::parquet::ColumnDescriptor descr(parquet_node_ptr, /*max_definition_level=*/1,
                                    /*max_repetition_level=*/0);

  // Copies the raw bytes of an int32 into a std::string so it can be passed to
  // EncodedStatistics::set_min / set_max below.
  auto int32_to_parquet_stats = [](int32_t v) {
    std::string value;
    value.resize(sizeof(int32_t));
    memcpy(value.data(), &v, sizeof(int32_t));
    return value;
  };
  {
    // Base case: when null_count is not set, the expression might contain null
    ::parquet::EncodedStatistics encoded_stats;
    encoded_stats.set_min(int32_to_parquet_stats(1));
    encoded_stats.set_max(int32_to_parquet_stats(100));
    encoded_stats.has_null_count = false;
    encoded_stats.all_null_value = false;
    encoded_stats.null_count = 0;
    auto stats = ::parquet::Statistics::Make(&descr, &encoded_stats, /*num_values=*/10);

    auto stat_expression =
        ParquetFileFragment::EvaluateStatisticsAsExpression(*field, *stats);
    // Guard the dereference below: fail the test cleanly instead of invoking
    // undefined behavior if no expression was produced.
    ASSERT_TRUE(stat_expression.has_value());
    EXPECT_EQ(stat_expression->ToString(),
              "(((x >= 1) and (x <= 100)) or is_null(x, {nan_is_null=false}))");
  }
  {
    // Special case: when num_values is 0 and the null count is set and
    // non-zero, the result is "is_null"; otherwise nothing can be guaranteed.
    ::parquet::EncodedStatistics encoded_stats;
    encoded_stats.has_null_count = true;
    encoded_stats.null_count = 1;
    encoded_stats.all_null_value = true;
    auto stats = ::parquet::Statistics::Make(&descr, &encoded_stats, /*num_values=*/0);
    auto stat_expression =
        ParquetFileFragment::EvaluateStatisticsAsExpression(*field, *stats);
    // Guard the dereference below, as in the base case.
    ASSERT_TRUE(stat_expression.has_value());
    EXPECT_EQ(stat_expression->ToString(), "is_null(x, {nan_is_null=false})");

    // Same statistics, but with the null count flagged as not set: no
    // expression is expected.
    encoded_stats.has_null_count = false;
    encoded_stats.all_null_value = false;
    stats = ::parquet::Statistics::Make(&descr, &encoded_stats, /*num_values=*/0);
    stat_expression = ParquetFileFragment::EvaluateStatisticsAsExpression(*field, *stats);
    EXPECT_FALSE(stat_expression.has_value());
  }
}

class DelayedBufferReader : public ::arrow::io::BufferReader {
public:
explicit DelayedBufferReader(const std::shared_ptr<::arrow::Buffer>& buffer)
Expand Down

0 comments on commit 132363c

Please sign in to comment.