diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs index d5b5283c149d..49ac07230e69 100644 --- a/parquet/src/file/metadata.rs +++ b/parquet/src/file/metadata.rs @@ -39,6 +39,7 @@ use parquet_format::{ColumnChunk, ColumnMetaData, RowGroup}; use crate::basic::{ColumnOrder, Compression, Encoding, Type}; use crate::errors::{ParquetError, Result}; +use crate::file::page_encoding_stats::{self, PageEncodingStats}; use crate::file::statistics::{self, Statistics}; use crate::schema::types::{ ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor, @@ -349,6 +350,7 @@ pub struct ColumnChunkMetaData { index_page_offset: Option, dictionary_page_offset: Option, statistics: Option, + encoding_stats: Option>, bloom_filter_offset: Option, offset_index_offset: Option, offset_index_length: Option, @@ -467,6 +469,17 @@ impl ColumnChunkMetaData { self.statistics.as_ref() } + /// Returns `true` if this column chunk contains page encoding stats, `false` otherwise. + pub fn has_page_encoding_stats(&self) -> bool { + self.encoding_stats.is_some() + } + + /// Returns the page encoding stats, + /// or `None` if no page encoding stats are available. + pub fn page_encoding_stats(&self) -> Option<&Vec> { + self.encoding_stats.as_ref() + } + /// Returns `true` if this column chunk contains a bloom filter offset, `false` otherwise. 
pub fn has_bloom_filter(&self) -> bool { self.bloom_filter_offset.is_some() @@ -528,11 +541,16 @@ impl ColumnChunkMetaData { let index_page_offset = col_metadata.index_page_offset; let dictionary_page_offset = col_metadata.dictionary_page_offset; let statistics = statistics::from_thrift(column_type, col_metadata.statistics); + let encoding_stats = col_metadata + .encoding_stats + .as_ref() + .map(|vec| vec.iter().map(page_encoding_stats::from_thrift).collect()); let bloom_filter_offset = col_metadata.bloom_filter_offset; let offset_index_offset = cc.offset_index_offset; let offset_index_length = cc.offset_index_length; let column_index_offset = cc.column_index_offset; let column_index_length = cc.column_index_length; + let result = ColumnChunkMetaData { column_type, column_path, @@ -548,6 +566,7 @@ impl ColumnChunkMetaData { index_page_offset, dictionary_page_offset, statistics, + encoding_stats, bloom_filter_offset, offset_index_offset, offset_index_length, @@ -572,7 +591,10 @@ impl ColumnChunkMetaData { index_page_offset: self.index_page_offset, dictionary_page_offset: self.dictionary_page_offset, statistics: statistics::to_thrift(self.statistics.as_ref()), - encoding_stats: None, + encoding_stats: self + .encoding_stats + .as_ref() + .map(|vec| vec.iter().map(page_encoding_stats::to_thrift).collect()), bloom_filter_offset: self.bloom_filter_offset, }; @@ -604,6 +626,7 @@ pub struct ColumnChunkMetaDataBuilder { index_page_offset: Option, dictionary_page_offset: Option, statistics: Option, + encoding_stats: Option>, bloom_filter_offset: Option, offset_index_offset: Option, offset_index_length: Option, @@ -627,6 +650,7 @@ impl ColumnChunkMetaDataBuilder { index_page_offset: None, dictionary_page_offset: None, statistics: None, + encoding_stats: None, bloom_filter_offset: None, offset_index_offset: None, offset_index_length: None, @@ -701,6 +725,12 @@ impl ColumnChunkMetaDataBuilder { self } + /// Sets page encoding stats for this column chunk. 
+ pub fn set_page_encoding_stats(mut self, value: Vec) -> Self { + self.encoding_stats = Some(value); + self + } + /// Sets optional bloom filter offset in bytes. pub fn set_bloom_filter_offset(mut self, value: Option) -> Self { self.bloom_filter_offset = value; @@ -748,6 +778,7 @@ impl ColumnChunkMetaDataBuilder { index_page_offset: self.index_page_offset, dictionary_page_offset: self.dictionary_page_offset, statistics: self.statistics, + encoding_stats: self.encoding_stats, bloom_filter_offset: self.bloom_filter_offset, offset_index_offset: self.offset_index_offset, offset_index_length: self.offset_index_length, @@ -760,6 +791,7 @@ impl ColumnChunkMetaDataBuilder { #[cfg(test)] mod tests { use super::*; + use crate::basic::{Encoding, PageType}; #[test] fn test_row_group_metadata_thrift_conversion() { @@ -815,6 +847,18 @@ mod tests { .set_total_uncompressed_size(3000) .set_data_page_offset(4000) .set_dictionary_page_offset(Some(5000)) + .set_page_encoding_stats(vec![ + PageEncodingStats { + page_type: PageType::DATA_PAGE, + encoding: Encoding::PLAIN, + count: 3, + }, + PageEncodingStats { + page_type: PageType::DATA_PAGE, + encoding: Encoding::RLE, + count: 5, + }, + ]) .set_bloom_filter_offset(Some(6000)) .set_offset_index_offset(Some(7000)) .set_offset_index_length(Some(25)) diff --git a/parquet/src/file/mod.rs b/parquet/src/file/mod.rs index abd6ac62af13..78fb7ef11fe3 100644 --- a/parquet/src/file/mod.rs +++ b/parquet/src/file/mod.rs @@ -97,6 +97,7 @@ //! ``` pub mod footer; pub mod metadata; +pub mod page_encoding_stats; pub mod properties; pub mod reader; pub mod serialized_reader; diff --git a/parquet/src/file/page_encoding_stats.rs b/parquet/src/file/page_encoding_stats.rs new file mode 100644 index 000000000000..3180c7820802 --- /dev/null +++ b/parquet/src/file/page_encoding_stats.rs @@ -0,0 +1,75 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::basic::{Encoding, PageType}; +use parquet_format::{ + Encoding as TEncoding, PageEncodingStats as TPageEncodingStats, PageType as TPageType, +}; + +/// `PageEncodingStats` for a column chunk: the number of pages of a given type and encoding. +#[derive(Clone, Debug, PartialEq)] +pub struct PageEncodingStats { + /// the page type (data/dictionary/...) + pub page_type: PageType, + /// encoding of the page + pub encoding: Encoding, + /// number of pages of this type with this encoding + pub count: i32, +} + +/// Converts Thrift definition into `PageEncodingStats`. +pub fn from_thrift(thrift_encoding_stats: &TPageEncodingStats) -> PageEncodingStats { + let page_type = PageType::from(thrift_encoding_stats.page_type); + let encoding = Encoding::from(thrift_encoding_stats.encoding); + let count = thrift_encoding_stats.count; + + PageEncodingStats { + page_type, + encoding, + count, + } +} + +/// Converts `PageEncodingStats` into Thrift definition. 
+pub fn to_thrift(encoding_stats: &PageEncodingStats) -> TPageEncodingStats { + let page_type = TPageType::from(encoding_stats.page_type); + let encoding = TEncoding::from(encoding_stats.encoding); + let count = encoding_stats.count; + + TPageEncodingStats { + page_type, + encoding, + count, + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::basic::{Encoding, PageType}; + + #[test] + fn test_page_encoding_stats_from_thrift() { + let stats = PageEncodingStats { + page_type: PageType::DATA_PAGE, + encoding: Encoding::PLAIN, + count: 1, + }; + + assert_eq!(from_thrift(&to_thrift(&stats)), stats); + } +} diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 4de3d1532200..4d19be8e6f0d 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -400,7 +400,7 @@ impl PageReader for SerializedPageReader { #[cfg(test)] mod tests { use super::*; - use crate::basic::ColumnOrder; + use crate::basic::{self, ColumnOrder}; use crate::record::RowAccessor; use crate::schema::parser::parse_message_type; use crate::util::test_common::{get_test_file, get_test_path}; @@ -772,6 +772,15 @@ mod tests { // test optional bloom filter offset assert_eq!(col0_metadata.bloom_filter_offset().unwrap(), 192); + // test page encoding stats + assert!(col0_metadata.has_page_encoding_stats()); + let page_encoding_stats = + col0_metadata.page_encoding_stats().unwrap().get(0).unwrap(); + + assert_eq!(page_encoding_stats.page_type, basic::PageType::DATA_PAGE); + assert_eq!(page_encoding_stats.encoding, Encoding::PLAIN); + assert_eq!(page_encoding_stats.count, 1); + // test optional column index offset assert!(col0_metadata.has_column_index()); assert_eq!(col0_metadata.column_index_offset().unwrap(), 156);