From 0191acfc917afbf4ce1cccbc399e9584b52d91e8 Mon Sep 17 00:00:00 2001 From: Shani Solomon Date: Tue, 15 Feb 2022 09:05:37 +0200 Subject: [PATCH 1/7] init --- parquet/src/file/metadata.rs | 34 ++++++++++++++++++ parquet/src/file/mod.rs | 1 + parquet/src/file/page_encoding_stats.rs | 48 +++++++++++++++++++++++++ parquet/src/file/serialized_reader.rs | 20 +++++++++++ 4 files changed, 103 insertions(+) create mode 100644 parquet/src/file/page_encoding_stats.rs diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs index 420c2fee225b..56bcd75d337f 100644 --- a/parquet/src/file/metadata.rs +++ b/parquet/src/file/metadata.rs @@ -39,6 +39,8 @@ use parquet_format::{ColumnChunk, ColumnMetaData, RowGroup}; use crate::basic::{ColumnOrder, Compression, Encoding, Type}; use crate::errors::{ParquetError, Result}; +use crate::file::page_encoding_stats; +use crate::file::page_encoding_stats::PageEncodingStats; use crate::file::statistics::{self, Statistics}; use crate::schema::types::{ ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor, @@ -349,6 +351,7 @@ pub struct ColumnChunkMetaData { index_page_offset: Option, dictionary_page_offset: Option, statistics: Option, + encoding_stats: Option>, } /// Represents common operations for a column chunk. @@ -462,6 +465,17 @@ impl ColumnChunkMetaData { self.statistics.as_ref() } + /// Returns `true` if this column chunk contains page encoding stats, `false` otherwise. + pub fn has_page_encoding_stats(&self) -> bool { + self.encoding_stats.is_some() + } + + /// Returns the offset for the page encoding stats, + /// or `None` if no page encoding stats are available. + pub fn page_encoding_stats(&self) -> Option<&Vec> { + self.encoding_stats.as_ref() + } + /// Method to convert from Thrift. pub fn from_thrift(column_descr: ColumnDescPtr, cc: ColumnChunk) -> Result { if cc.meta_data.is_none() { @@ -485,6 +499,16 @@ impl ColumnChunkMetaData { let index_page_offset = col_metadata.index_page_offset; let dictionary_page_offset = col_metadata.dictionary_page_offset; let statistics = statistics::from_thrift(column_type, col_metadata.statistics); + let encoding_stats = match col_metadata.encoding_stats { + Some(encodings) => encodings + .iter() + .map(|p_encoding_stats| { + page_encoding_stats::from_thrift(p_encoding_stats) + }) + .collect(), + None => None, + }; + let result = ColumnChunkMetaData { column_type, column_path, @@ -500,6 +524,7 @@ impl ColumnChunkMetaData { index_page_offset, dictionary_page_offset, statistics, + encoding_stats, }; Ok(result) } @@ -551,6 +576,7 @@ pub struct ColumnChunkMetaDataBuilder { index_page_offset: Option, dictionary_page_offset: Option, statistics: Option, + encoding_stats: Option>, } impl ColumnChunkMetaDataBuilder { @@ -569,6 +595,7 @@ impl ColumnChunkMetaDataBuilder { index_page_offset: None, dictionary_page_offset: None, statistics: None, + encoding_stats: None, } } @@ -638,6 +665,12 @@ impl ColumnChunkMetaDataBuilder { self } + /// Sets page encoding stats for this column chunk. + pub fn set_page_encoding_stats(mut self, value: Vec) -> Self { + self.encoding_stats = Some(value); + self + } + /// Builds column chunk metadata. pub fn build(self) -> Result { Ok(ColumnChunkMetaData { @@ -655,6 +688,7 @@ impl ColumnChunkMetaDataBuilder { index_page_offset: self.index_page_offset, dictionary_page_offset: self.dictionary_page_offset, statistics: self.statistics, + encoding_stats: self.encoding_stats, }) } } diff --git a/parquet/src/file/mod.rs b/parquet/src/file/mod.rs index abd6ac62af13..b90802f81dcc 100644 --- a/parquet/src/file/mod.rs +++ b/parquet/src/file/mod.rs @@ -97,6 +97,7 @@ //! ``` pub mod footer; pub mod metadata; +mod page_encoding_stats; pub mod properties; pub mod reader; pub mod serialized_reader; diff --git a/parquet/src/file/page_encoding_stats.rs b/parquet/src/file/page_encoding_stats.rs new file mode 100644 index 000000000000..ec8efbc2e7c2 --- /dev/null +++ b/parquet/src/file/page_encoding_stats.rs @@ -0,0 +1,48 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use parquet_format::PageEncodingStats as TPageEncodingStats; + +use crate::basic::{Encoding, PageType}; + +/// PageEncodingStats for a column chunk and data page. +#[derive(Clone, Debug, PartialEq)] +pub struct PageEncodingStats { + /// the page type (data/dic/...) + pub page_type: PageType, + /// encoding of the page + pub encoding: Encoding, + /// number of pages of this type with this encoding + pub count: i32, +} + +impl PageEncodingStats {} + +/// Converts Thrift definition into `PageEncodingStats`. +pub fn from_thrift( + thrift_encoding_stats: &TPageEncodingStats, +) -> Option { + let page_type = PageType::from(thrift_encoding_stats.page_type); + let encoding = Encoding::from(thrift_encoding_stats.encoding); + let count = thrift_encoding_stats.count; + + Some(PageEncodingStats { + page_type, + encoding, + count, + }) +} diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 420d2e22645f..77516a7dab46 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -400,6 +400,7 @@ impl PageReader for SerializedPageReader { #[cfg(test)] mod tests { use super::*; + use crate::basic; use crate::basic::ColumnOrder; use crate::record::RowAccessor; use crate::schema::parser::parse_message_type; @@ -760,6 +761,25 @@ mod tests { ); } + #[test] + fn test_file_reader_optional_metadata() { + let file = get_test_file("data_index_bloom.parquet"); + let file_reader = Arc::new(SerializedFileReader::new(file).unwrap()); + let col_metadata = file_reader.metadata.row_group(0).column(0); + + // test page encoding stats + assert!(col_metadata.has_page_encoding_stats()); + let page_encoding_stats = col_metadata + .page_encoding_stats() + .unwrap() + .get(0) + .unwrap(); + + assert_eq!(page_encoding_stats.page_type, basic::PageType::DATA_PAGE); + assert_eq!(page_encoding_stats.encoding, Encoding::PLAIN); + assert_eq!(page_encoding_stats.count, 1); + } + #[test] fn test_file_reader_filter_row_groups() -> Result<()> { let test_file = get_test_file("alltypes_plain.parquet"); From 475a8b155976ac9fbe14d1698a7de3f33d96652f Mon Sep 17 00:00:00 2001 From: Shani Solomon Date: Tue, 15 Feb 2022 10:19:48 +0200 Subject: [PATCH 2/7] replaced test file --- parquet/src/file/serialized_reader.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 77516a7dab46..28b52db3a654 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -763,7 +763,7 @@ mod tests { #[test] fn test_file_reader_optional_metadata() { - let file = get_test_file("data_index_bloom.parquet"); + let file = get_test_file("data_index_bloom_encoding_stats.parquet"); let file_reader = Arc::new(SerializedFileReader::new(file).unwrap()); let col_metadata = file_reader.metadata.row_group(0).column(0); From 2351e458143407b4bb346e3495217445b075d999 Mon Sep 17 00:00:00 2001 From: Shani Solomon Date: Tue, 15 Feb 2022 13:52:43 +0200 Subject: [PATCH 3/7] init --- parquet/src/file/metadata.rs | 23 +++++++++---- parquet/src/file/page_encoding_stats.rs | 46 ++++++++++++++++++++----- parquet/src/file/serialized_reader.rs | 7 ++-- 3 files changed, 55 insertions(+), 21 deletions(-) diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs index 56bcd75d337f..b9b8ce517f4b 100644 --- a/parquet/src/file/metadata.rs +++ b/parquet/src/file/metadata.rs @@ -500,12 +500,12 @@ impl ColumnChunkMetaData { let dictionary_page_offset = col_metadata.dictionary_page_offset; let statistics = statistics::from_thrift(column_type, col_metadata.statistics); let encoding_stats = match col_metadata.encoding_stats { - Some(encodings) => encodings - .iter() - .map(|p_encoding_stats| { - page_encoding_stats::from_thrift(p_encoding_stats) - }) - .collect(), + Some(encodings) => Some( + encodings + .iter() + .map(|v| page_encoding_stats::from_thrift(v)) + .collect(), + ), None => None, }; @@ -531,6 +531,15 @@ impl ColumnChunkMetaData { /// Method to convert to Thrift. pub fn to_thrift(&self) -> ColumnChunk { + let page_encoding_stats = match self.encoding_stats.as_ref() { + Some(vec) => Some( + vec.iter() + .map(|v| page_encoding_stats::to_thrift(&v)) + .collect(), + ), + None => None, + }; + let column_metadata = ColumnMetaData { type_: self.column_type.into(), encodings: self.encodings().iter().map(|&v| v.into()).collect(), @@ -544,7 +553,7 @@ impl ColumnChunkMetaData { index_page_offset: self.index_page_offset, dictionary_page_offset: self.dictionary_page_offset, statistics: statistics::to_thrift(self.statistics.as_ref()), - encoding_stats: None, + encoding_stats: page_encoding_stats, bloom_filter_offset: None, }; diff --git a/parquet/src/file/page_encoding_stats.rs b/parquet/src/file/page_encoding_stats.rs index ec8efbc2e7c2..7466585e459d 100644 --- a/parquet/src/file/page_encoding_stats.rs +++ b/parquet/src/file/page_encoding_stats.rs @@ -15,9 +15,10 @@ // specific language governing permissions and limitations // under the License. -use parquet_format::PageEncodingStats as TPageEncodingStats; - use crate::basic::{Encoding, PageType}; +use parquet_format::{ + Encoding as TEncoding, PageEncodingStats as TPageEncodingStats, PageType as TPageType, +}; /// PageEncodingStats for a column chunk and data page. #[derive(Clone, Debug, PartialEq)] @@ -30,19 +31,46 @@ pub struct PageEncodingStats { pub count: i32, } -impl PageEncodingStats {} - /// Converts Thrift definition into `PageEncodingStats`. -pub fn from_thrift( - thrift_encoding_stats: &TPageEncodingStats, -) -> Option { +pub fn from_thrift(thrift_encoding_stats: &TPageEncodingStats) -> PageEncodingStats { let page_type = PageType::from(thrift_encoding_stats.page_type); let encoding = Encoding::from(thrift_encoding_stats.encoding); let count = thrift_encoding_stats.count; - Some(PageEncodingStats { + PageEncodingStats { + page_type, + encoding, + count, + } +} + +/// Converts `PageEncodingStats` into Thrift definition. +pub fn to_thrift(encoding_stats: &PageEncodingStats) -> TPageEncodingStats { + let page_type = TPageType::from(encoding_stats.page_type); + let encoding = TEncoding::from(encoding_stats.encoding); + let count = encoding_stats.count; + + TPageEncodingStats { page_type, encoding, count, - }) + } +} + +#[cfg(test)] +mod tests { + use crate::basic::{Encoding, PageType}; + use crate::file::page_encoding_stats::{from_thrift, to_thrift, PageEncodingStats}; + + #[test] + fn test_page_encoding_stats_from_thrift() { + let stats = PageEncodingStats { + page_type: PageType::DATA_PAGE, + encoding: Encoding::PLAIN, + count: 1, + }; + + let thrift_stats = to_thrift(&stats); + assert_eq!(from_thrift(&thrift_stats), stats); + } } diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 28b52db3a654..5d931d698062 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -769,11 +769,8 @@ mod tests { // test page encoding stats assert!(col_metadata.has_page_encoding_stats()); - let page_encoding_stats = col_metadata - .page_encoding_stats() - .unwrap() - .get(0) - .unwrap(); + let page_encoding_stats = + col_metadata.page_encoding_stats().unwrap().get(0).unwrap(); assert_eq!(page_encoding_stats.page_type, basic::PageType::DATA_PAGE); assert_eq!(page_encoding_stats.encoding, Encoding::PLAIN); From f654186e459a9f83a02a8512e9319a10c33b87d5 Mon Sep 17 00:00:00 2001 From: Shani Solomon Date: Wed, 16 Feb 2022 11:34:56 +0200 Subject: [PATCH 4/7] thrift conversion --- parquet/src/file/metadata.rs | 13 ++++++------- parquet/src/file/serialized_reader.rs | 4 ++-- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs index 4a0f96e06af2..0d4c6d6df8de 100644 --- a/parquet/src/file/metadata.rs +++ b/parquet/src/file/metadata.rs @@ -334,7 +334,7 @@ impl RowGroupMetaDataBuilder { } /// Metadata for a column chunk. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, PartialEq)] pub struct ColumnChunkMetaData { column_type: Type, column_path: ColumnPath, @@ -726,6 +726,7 @@ impl ColumnChunkMetaDataBuilder { #[cfg(test)] mod tests { use super::*; + use crate::basic::{PageType, Encoding}; // todo see if we can remove #[test] fn test_row_group_metadata_thrift_conversion() { @@ -781,17 +782,15 @@ mod tests { .set_total_uncompressed_size(3000) .set_data_page_offset(4000) .set_dictionary_page_offset(Some(5000)) + .set_page_encoding_stats(vec![PageEncodingStats{ page_type: PageType::DATA_PAGE, encoding: Encoding::PLAIN, count: 3}]) .build() .unwrap(); - let col_chunk_exp = col_metadata.to_thrift(); - let col_chunk_res = - ColumnChunkMetaData::from_thrift(column_descr, col_chunk_exp.clone()) - .unwrap() - .to_thrift(); + ColumnChunkMetaData::from_thrift(column_descr, col_metadata.to_thrift()) + .unwrap(); - assert_eq!(col_chunk_res, col_chunk_exp); + assert_eq!(col_chunk_res, col_metadata); } #[test] diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index ad39c15f4f83..a96d7d7dd7bb 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -774,9 +774,9 @@ mod tests { assert_eq!(col0_metadata.bloom_filter_offset().unwrap(), 192); // test page encoding stats - assert!(col_metadata.has_page_encoding_stats()); + assert!(col0_metadata.has_page_encoding_stats()); let page_encoding_stats = - col_metadata.page_encoding_stats().unwrap().get(0).unwrap(); + col0_metadata.page_encoding_stats().unwrap().get(0).unwrap(); assert_eq!(page_encoding_stats.page_type, basic::PageType::DATA_PAGE); assert_eq!(page_encoding_stats.encoding, Encoding::PLAIN); From 1794c2048ccf8b482800b00f34b021b4b87a4fa0 Mon Sep 17 00:00:00 2001 From: Shani Solomon Date: Wed, 16 Feb 2022 12:05:59 +0200 Subject: [PATCH 5/7] refactor --- parquet/src/file/metadata.rs | 8 ++++++-- parquet/src/file/page_encoding_stats.rs | 5 ++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs index 0d4c6d6df8de..3d75960678b3 100644 --- a/parquet/src/file/metadata.rs +++ b/parquet/src/file/metadata.rs @@ -726,7 +726,7 @@ impl ColumnChunkMetaDataBuilder { #[cfg(test)] mod tests { use super::*; - use crate::basic::{PageType, Encoding}; // todo see if we can remove + use crate::basic::{Encoding, PageType}; #[test] fn test_row_group_metadata_thrift_conversion() { @@ -782,7 +782,11 @@ mod tests { .set_total_uncompressed_size(3000) .set_data_page_offset(4000) .set_dictionary_page_offset(Some(5000)) - .set_page_encoding_stats(vec![PageEncodingStats{ page_type: PageType::DATA_PAGE, encoding: Encoding::PLAIN, count: 3}]) + .set_page_encoding_stats(vec![PageEncodingStats { + page_type: PageType::DATA_PAGE, + encoding: Encoding::PLAIN, + count: 3, + }]) .build() .unwrap(); diff --git a/parquet/src/file/page_encoding_stats.rs b/parquet/src/file/page_encoding_stats.rs index 7466585e459d..3180c7820802 100644 --- a/parquet/src/file/page_encoding_stats.rs +++ b/parquet/src/file/page_encoding_stats.rs @@ -59,8 +59,8 @@ pub fn to_thrift(encoding_stats: &PageEncodingStats) -> TPageEncodingStats { #[cfg(test)] mod tests { + use super::*; use crate::basic::{Encoding, PageType}; - use crate::file::page_encoding_stats::{from_thrift, to_thrift, PageEncodingStats}; #[test] fn test_page_encoding_stats_from_thrift() { @@ -70,7 +70,6 @@ mod tests { count: 1, }; - let thrift_stats = to_thrift(&stats); - assert_eq!(from_thrift(&thrift_stats), stats); + assert_eq!(from_thrift(&to_thrift(&stats)), stats); } } From e4a7b136ba5a78b569f63687627c6612638e3b0a Mon Sep 17 00:00:00 2001 From: Shani Solomon Date: Wed, 16 Feb 2022 18:51:57 +0200 Subject: [PATCH 6/7] tests --- parquet/src/file/metadata.rs | 19 +++++++++++++------ parquet/src/file/mod.rs | 2 +- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs index e7913f169c70..66d3c5f5c44c 100644 --- a/parquet/src/file/metadata.rs +++ b/parquet/src/file/metadata.rs @@ -566,7 +566,7 @@ impl ColumnChunkMetaData { dictionary_page_offset: self.dictionary_page_offset, statistics: statistics::to_thrift(self.statistics.as_ref()), encoding_stats: page_encoding_stats, - bloom_filter_offset: None, // todo verify + bloom_filter_offset: self.bloom_filter_offset, }; ColumnChunk { @@ -782,11 +782,18 @@ mod tests { .set_total_uncompressed_size(3000) .set_data_page_offset(4000) .set_dictionary_page_offset(Some(5000)) - .set_page_encoding_stats(vec![PageEncodingStats { - page_type: PageType::DATA_PAGE, - encoding: Encoding::PLAIN, - count: 3, - }]) + .set_page_encoding_stats(vec![ + PageEncodingStats { + page_type: PageType::DATA_PAGE, + encoding: Encoding::PLAIN, + count: 3, + }, + PageEncodingStats { + page_type: PageType::DATA_PAGE, + encoding: Encoding::RLE, + count: 5, + }, + ]) .set_bloom_filter_offset(Some(6000)) .build() .unwrap(); diff --git a/parquet/src/file/mod.rs b/parquet/src/file/mod.rs index b90802f81dcc..78fb7ef11fe3 100644 --- a/parquet/src/file/mod.rs +++ b/parquet/src/file/mod.rs @@ -97,7 +97,7 @@ //! ``` pub mod footer; pub mod metadata; -mod page_encoding_stats; +pub mod page_encoding_stats; pub mod properties; pub mod reader; pub mod serialized_reader; From 9db23f8c2ab4364db5c87f9034730e7ebfb87f5b Mon Sep 17 00:00:00 2001 From: Shani Solomon Date: Thu, 17 Feb 2022 17:56:51 +0200 Subject: [PATCH 7/7] clippy --- parquet/src/file/metadata.rs | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs index 37fcf2799b61..49ac07230e69 100644 --- a/parquet/src/file/metadata.rs +++ b/parquet/src/file/metadata.rs @@ -541,15 +541,10 @@ impl ColumnChunkMetaData { let index_page_offset = col_metadata.index_page_offset; let dictionary_page_offset = col_metadata.dictionary_page_offset; let statistics = statistics::from_thrift(column_type, col_metadata.statistics); - let encoding_stats = match col_metadata.encoding_stats { - Some(encodings) => Some( - encodings - .iter() - .map(|v| page_encoding_stats::from_thrift(v)) - .collect(), - ), - None => None, - }; + let encoding_stats = col_metadata + .encoding_stats + .as_ref() + .map(|vec| vec.iter().map(page_encoding_stats::from_thrift).collect()); let bloom_filter_offset = col_metadata.bloom_filter_offset; let offset_index_offset = cc.offset_index_offset; let offset_index_length = cc.offset_index_length; @@ -583,15 +578,6 @@ impl ColumnChunkMetaData { /// Method to convert to Thrift. pub fn to_thrift(&self) -> ColumnChunk { - let page_encoding_stats = match self.encoding_stats.as_ref() { - Some(vec) => Some( - vec.iter() - .map(|v| page_encoding_stats::to_thrift(&v)) - .collect(), - ), - None => None, - }; - let column_metadata = ColumnMetaData { type_: self.column_type.into(), encodings: self.encodings().iter().map(|&v| v.into()).collect(), @@ -605,7 +591,10 @@ impl ColumnChunkMetaData { index_page_offset: self.index_page_offset, dictionary_page_offset: self.dictionary_page_offset, statistics: statistics::to_thrift(self.statistics.as_ref()), - encoding_stats: page_encoding_stats, + encoding_stats: self + .encoding_stats + .as_ref() + .map(|vec| vec.iter().map(page_encoding_stats::to_thrift).collect()), bloom_filter_offset: self.bloom_filter_offset, };