Patch summary (Parquet C++ bloom filter writer cleanups):
  * column_writer.cc: assert in the update_stats lambda of WriteArrowDictionary
    that page statistics or a bloom filter is present; compute non_null_count
    only inside the page_statistics_ branch; make the `const bool*`
    specialization of UpdateBloomFilter throw ParquetException when a bloom
    filter is set, instead of only DCHECK-ing.
  * metadata.h: reflow the RowGroupBloomFilterLocation doc comment.
  * page_index.h: include arrow/io/interfaces.h instead of arrow/io/type_fwd.h.
  * properties.h: correct the disable_bloom_filter doc to "Default disabled".

DCHECK takes only the condition; the message is streamed with `<<`.

NOTE(review): template argument lists (e.g. after `std::shared_ptr`,
`std::vector`, `std::map`, `template`) look stripped in this copy of the
patch, and source indentation was reconstructed; confirm the context lines
against the upstream files before applying.

diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc
index b1534aeaa9969..d9cb24c7fe0de 100644
--- a/cpp/src/parquet/column_writer.cc
+++ b/cpp/src/parquet/column_writer.cc
@@ -1701,6 +1701,8 @@ Status TypedColumnWriterImpl::WriteArrowDictionary(
 
   auto update_stats = [&](int64_t num_chunk_levels,
                           const std::shared_ptr& chunk_indices) {
+    DCHECK(page_statistics_ != nullptr || bloom_filter_ != nullptr)
+        << "No stats or filter";
     // TODO(PARQUET-2068) This approach may make two copies. First, a copy of the
     // indices array to a (hopefully smaller) referenced indices array. Second, a copy
     // of the values array to a (probably not smaller) referenced values array.
@@ -1725,9 +1727,8 @@ Status TypedColumnWriterImpl::WriteArrowDictionary(
                                                      &exec_ctx));
       referenced_dictionary = referenced_dictionary_datum.make_array();
     }
-
-    int64_t non_null_count = chunk_indices->length() - chunk_indices->null_count();
     if (page_statistics_ != nullptr) {
+      int64_t non_null_count = chunk_indices->length() - chunk_indices->null_count();
       page_statistics_->IncrementNullCount(num_chunk_levels - non_null_count);
       page_statistics_->IncrementNumValues(non_null_count);
       page_statistics_->Update(*referenced_dictionary, /*update_counts=*/false);
@@ -2426,7 +2427,9 @@ void TypedColumnWriterImpl::UpdateBloomFilter(const FLBA* values,
 
 template <>
 void TypedColumnWriterImpl::UpdateBloomFilter(const bool*, int64_t) {
-  DCHECK(bloom_filter_ == nullptr);
+  if (ARROW_PREDICT_FALSE(bloom_filter_ != nullptr)) {
+    throw ParquetException("BooleanType does not support bloom filters");
+  }
 }
 
 template
diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h
index 5c178dda4732e..6fa2e16311532 100644
--- a/cpp/src/parquet/metadata.h
+++ b/cpp/src/parquet/metadata.h
@@ -516,9 +516,10 @@ class PARQUET_EXPORT RowGroupMetaDataBuilder {
 using RowGroupIndexLocation = std::vector>;
 
 /// Alias type of bloom filter location of a row group. The filter location
-/// is located by column ordinal. Number of columns with a bloom filter to
-/// be relatively small compared to the number of overall columns, so
-/// map is used.
+/// is located by column ordinal.
+///
+/// The number of columns with a bloom filter is expected to be relatively
+/// small compared to the overall number of columns, so a map is used.
 using RowGroupBloomFilterLocation = std::map;
 
 /// Alias type of page index and location of a parquet file. The
diff --git a/cpp/src/parquet/page_index.h b/cpp/src/parquet/page_index.h
index 89c49cf7a896f..d45c59cab223f 100644
--- a/cpp/src/parquet/page_index.h
+++ b/cpp/src/parquet/page_index.h
@@ -17,7 +17,7 @@
 
 #pragma once
 
-#include "arrow/io/type_fwd.h"
+#include "arrow/io/interfaces.h"
 #include "parquet/encryption/type_fwd.h"
 #include "parquet/types.h"
 
diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h
index 01ba164c8f9ee..ad88b4fc2f970 100644
--- a/cpp/src/parquet/properties.h
+++ b/cpp/src/parquet/properties.h
@@ -599,7 +599,7 @@ class PARQUET_EXPORT WriterProperties {
     }
 
     /// Disable bloom filter for the column specified by `path`.
-    /// Default enabled.
+    /// Default disabled.
     Builder* disable_bloom_filter(const std::shared_ptr& path) {
       return this->disable_bloom_filter(path->ToDotString());
     }