From 0f7af02f5d5bcc9b766078209c62f37d576e41f8 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 10 Dec 2024 15:00:14 -0800 Subject: [PATCH 01/10] fix a few edge cases with utf-8 incrementing --- parquet/src/column/writer/mod.rs | 131 ++++++++++++++++++++++++------- 1 file changed, 103 insertions(+), 28 deletions(-) diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 8d0be5f9f8c4..508ad204d3be 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -1448,7 +1448,19 @@ fn increment_utf8(mut data: Vec) -> Option> { if str::from_utf8(&data).is_ok() { return Some(data); } - data[idx] = original; + // Incrementing "original" did not yield a valid unicode character, so it overflowed + // its available bits. If it was a continuation byte (b10xxxxxx) then set to min + // continuation (b10000000). Otherwise it was the first byte so set the entire char + // to all zeros. + if original & 0xc0u8 == 0x80u8 { + data[idx] = 0x80u8; + } else { + let byte_width = original.leading_ones() as usize; + match byte_width { + 0 => data[idx] = 0, + _ => data[idx..idx + byte_width].fill(0u8), + } + } } } @@ -1458,6 +1470,7 @@ fn increment_utf8(mut data: Vec) -> Option> { #[cfg(test)] mod tests { use crate::file::properties::DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH; + use core::str; use rand::distributions::uniform::SampleUniform; use std::sync::Arc; @@ -3136,39 +3149,64 @@ mod tests { #[test] fn test_increment_utf8() { + let test_inc = |o: &str, expected: &str| { + if let Ok(v) = String::from_utf8(increment_utf8(o.as_bytes().to_vec()).unwrap()) { + // Got the expected result... + assert_eq!(v, expected); + // and it's greater than the original string + assert!(*v > *o); + // Also show that BinaryArray level comparison works here + let mut greater = ByteArray::new(); + greater.set_data(Bytes::from(v)); + let mut original = ByteArray::new(); + original.set_data(Bytes::from(o.as_bytes().to_vec())); + assert!(greater > original); + } else { + panic!("Expected incremented UTF8 string to also be valid."); + } + }; + // Basic ASCII case - let v = increment_utf8("hello".as_bytes().to_vec()).unwrap(); - assert_eq!(&v, "hellp".as_bytes()); + test_inc("hello", "hellp"); + + // 1-byte ending in max 1-byte + test_inc("a\u{7f}", "b\x00"); - // Also show that BinaryArray level comparison works here - let mut greater = ByteArray::new(); - greater.set_data(Bytes::from(v)); - let mut original = ByteArray::new(); - original.set_data(Bytes::from("hello".as_bytes().to_vec())); - assert!(greater > original); + // 1-byte max should not truncate as it would need 2-byte code points + assert!(increment_utf8("\u{7f}\u{7f}".as_bytes().to_vec()).is_none()); // UTF8 string - let s = "โค๏ธ๐Ÿงก๐Ÿ’›๐Ÿ’š๐Ÿ’™๐Ÿ’œ"; - let v = increment_utf8(s.as_bytes().to_vec()).unwrap(); + test_inc("โค๏ธ๐Ÿงก๐Ÿ’›๐Ÿ’š๐Ÿ’™๐Ÿ’œ", "โค๏ธ๐Ÿงก๐Ÿ’›๐Ÿ’š๐Ÿ’™๐Ÿ’"); - if let Ok(new) = String::from_utf8(v) { - assert_ne!(&new, s); - assert_eq!(new, "โค๏ธ๐Ÿงก๐Ÿ’›๐Ÿ’š๐Ÿ’™๐Ÿ’"); - assert!(new.as_bytes().last().unwrap() > s.as_bytes().last().unwrap()); - } else { - panic!("Expected incremented UTF8 string to also be valid.") - } + // 2-byte without overflow + test_inc("รฉรฉรฉรฉ", "รฉรฉรฉรช"); - // Max UTF8 character - should be a No-Op - let s = char::MAX.to_string(); - assert_eq!(s.len(), 4); - let v = increment_utf8(s.as_bytes().to_vec()); - assert!(v.is_none()); + // 2-byte that overflows lowest byte + test_inc("\u{ff}\u{ff}", "\u{ff}\u{100}"); + + // 2-byte ending in max 2-byte + test_inc("a\u{7ff}", "b\x00\x00"); + + // Max 2-byte should not truncate as it would need 3-byte code points + assert!(increment_utf8("\u{7ff}\u{7ff}".as_bytes().to_vec()).is_none()); + + // 3-byte without overflow + test_inc("เ €เ €", "เ €เ "); + + // 3-byte ending in max 3-byte + test_inc("a\u{ffff}", "b\x00\x00\x00"); + + // Max 3-byte should not truncate as it would need 4-byte code points + assert!(increment_utf8("\u{ffff}\u{ffff}".as_bytes().to_vec()).is_none()); - // Handle multi-byte UTF8 characters - let s = "a\u{10ffff}"; - let v = increment_utf8(s.as_bytes().to_vec()); - assert_eq!(&v.unwrap(), "b\u{10ffff}".as_bytes()); + // 4-byte without overflow + test_inc("๐€€๐€€", "๐€€๐€"); + + // 4-byte ending in max unicode + test_inc("a\u{10ffff}", "b\x00\x00\x00\x00"); + + // Max 4-byte should not truncate + assert!(increment_utf8("\u{10ffff}\u{10ffff}".as_bytes().to_vec()).is_none()); } #[test] @@ -3178,7 +3216,6 @@ mod tests { let r = truncate_utf8(data, data.as_bytes().len()).unwrap(); assert_eq!(r.len(), data.as_bytes().len()); assert_eq!(&r, data.as_bytes()); - println!("len is {}", data.len()); // We slice it away from the UTF8 boundary let r = truncate_utf8(data, 13).unwrap(); @@ -3188,6 +3225,44 @@ mod tests { // One multi-byte code point, and a length shorter than it, so we can't slice it let r = truncate_utf8("\u{0836}", 1); assert!(r.is_none()); + + // test truncate and increment for max bounds on utf-8 stats + // 7-bit (i.e. ASCII) + let r = truncate_utf8("yyyyyyyyy", 8) + .and_then(increment_utf8) + .unwrap(); + assert_eq!(&r, "yyyyyyyz".as_bytes()); + + // 2-byte without overflow + let r = truncate_utf8("รฉรฉรฉรฉรฉ", 8).and_then(increment_utf8).unwrap(); + assert_eq!(&r, "รฉรฉรฉรช".as_bytes()); + + // 2-byte that overflows lowest byte + let r = truncate_utf8("\u{ff}\u{ff}\u{ff}\u{ff}\u{ff}", 8) + .and_then(increment_utf8) + .unwrap(); + assert_eq!(&r, "\u{ff}\u{ff}\u{ff}\u{100}".as_bytes()); + + // max 2-byte should not truncate as it would need 3-byte code points + let r = truncate_utf8("฿ฟ฿ฟ฿ฟ฿ฟ฿ฟ", 8).and_then(increment_utf8); + assert!(r.is_none()); + + // 3-byte without overflow + let r = truncate_utf8("เ €เ €เ €", 8).and_then(increment_utf8).unwrap(); + assert_eq!(&r, "เ €เ ".as_bytes()); + assert_eq!(r.len(), 6); + + // max 3-byte should not truncate as it would need 4-byte code points + let r = truncate_utf8("\u{ffff}\u{ffff}\u{ffff}", 8).and_then(increment_utf8); + assert!(r.is_none()); + + // 4-byte without overflow + let r = truncate_utf8("๐€€๐€€๐€€", 8).and_then(increment_utf8).unwrap(); + assert_eq!(&r, "๐€€๐€".as_bytes()); + + // max 4-byte should not truncate + let r = truncate_utf8("\u{10ffff}\u{10ffff}", 8).and_then(increment_utf8); + assert!(r.is_none()); } #[test] From 52706f97b349bd0288e59e8a24755d11176caf63 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 10 Dec 2024 20:58:09 -0800 Subject: [PATCH 02/10] add todo --- parquet/src/column/writer/mod.rs | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 508ad204d3be..3c6f57213f01 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -1440,26 +1440,34 @@ fn increment(mut data: Vec) -> Option> { /// Try and increment the the string's bytes from right to left, returning when the result /// is a valid UTF8 string. Returns `None` when it can't increment any byte. fn increment_utf8(mut data: Vec) -> Option> { + // TODO(ets): this seems like too many copies. Rethink the whole truncate+increment + // process. + let mut len = data.len(); for idx in (0..data.len()).rev() { let original = data[idx]; let (byte, overflow) = original.overflowing_add(1); if !overflow { data[idx] = byte; if str::from_utf8(&data).is_ok() { - return Some(data); + if len != data.len() { + return Some(data[..len].to_vec()); + } else { + return Some(data); + } } // Incrementing "original" did not yield a valid unicode character, so it overflowed // its available bits. If it was a continuation byte (b10xxxxxx) then set to min - // continuation (b10000000). Otherwise it was the first byte so set the entire char - // to all zeros. + // continuation (b10000000). Otherwise it was the first byte so set reset the first + // byte back to its original value (so data remains a valid string) and reduce "len". if original & 0xc0u8 == 0x80u8 { data[idx] = 0x80u8; } else { - let byte_width = original.leading_ones() as usize; - match byte_width { - 0 => data[idx] = 0, - _ => data[idx..idx + byte_width].fill(0u8), - } + data[idx] = original; + len -= if original < 0x80u8 { + 1 + } else { + original.leading_ones() as usize + }; } } } @@ -3170,7 +3178,7 @@ mod tests { test_inc("hello", "hellp"); // 1-byte ending in max 1-byte - test_inc("a\u{7f}", "b\x00"); + test_inc("a\u{7f}", "b"); // 1-byte max should not truncate as it would need 2-byte code points assert!(increment_utf8("\u{7f}\u{7f}".as_bytes().to_vec()).is_none()); @@ -3185,7 +3193,7 @@ mod tests { test_inc("\u{ff}\u{ff}", "\u{ff}\u{100}"); // 2-byte ending in max 2-byte - test_inc("a\u{7ff}", "b\x00\x00"); + test_inc("a\u{7ff}", "b"); // Max 2-byte should not truncate as it would need 3-byte code points assert!(increment_utf8("\u{7ff}\u{7ff}".as_bytes().to_vec()).is_none()); @@ -3194,7 +3202,7 @@ mod tests { test_inc("เ €เ €", "เ €เ "); // 3-byte ending in max 3-byte - test_inc("a\u{ffff}", "b\x00\x00\x00"); + test_inc("a\u{ffff}", "b"); // Max 3-byte should not truncate as it would need 4-byte code points assert!(increment_utf8("\u{ffff}\u{ffff}".as_bytes().to_vec()).is_none()); @@ -3203,7 +3211,7 @@ mod tests { test_inc("๐€€๐€€", "๐€€๐€"); // 4-byte ending in max unicode - test_inc("a\u{10ffff}", "b\x00\x00\x00\x00"); + test_inc("a\u{10ffff}", "b"); // Max 4-byte should not truncate assert!(increment_utf8("\u{10ffff}\u{10ffff}".as_bytes().to_vec()).is_none()); From 80fa0dd24f1b9ea99fb923885ec1727378b45450 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 11 Dec 2024 08:23:28 -0800 Subject: [PATCH 03/10] simplify truncation --- parquet/src/column/writer/mod.rs | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 3c6f57213f01..30eb5f0a6d22 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -1440,8 +1440,9 @@ fn increment(mut data: Vec) -> Option> { /// Try and increment the the string's bytes from right to left, returning when the result /// is a valid UTF8 string. Returns `None` when it can't increment any byte. fn increment_utf8(mut data: Vec) -> Option> { - // TODO(ets): this seems like too many copies. Rethink the whole truncate+increment - // process. + const UTF8_CONTINUATION: u8 = 0x80; + const UTF8_CONTINUATION_MASK: u8 = 0xc0; + let mut len = data.len(); for idx in (0..data.len()).rev() { let original = data[idx]; @@ -1450,24 +1451,19 @@ fn increment_utf8(mut data: Vec) -> Option> { data[idx] = byte; if str::from_utf8(&data).is_ok() { if len != data.len() { - return Some(data[..len].to_vec()); - } else { - return Some(data); + data.truncate(len); } + return Some(data); } // Incrementing "original" did not yield a valid unicode character, so it overflowed // its available bits. If it was a continuation byte (b10xxxxxx) then set to min // continuation (b10000000). Otherwise it was the first byte so set reset the first // byte back to its original value (so data remains a valid string) and reduce "len". - if original & 0xc0u8 == 0x80u8 { - data[idx] = 0x80u8; + if original & UTF8_CONTINUATION_MASK == UTF8_CONTINUATION { + data[idx] = UTF8_CONTINUATION; } else { data[idx] = original; - len -= if original < 0x80u8 { - 1 - } else { - original.leading_ones() as usize - }; + len = idx; } } } From f1726ab5eeaac2118fe21acdfa9a25f9cfb03d8a Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 11 Dec 2024 09:03:30 -0800 Subject: [PATCH 04/10] add another test --- parquet/src/column/writer/mod.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 30eb5f0a6d22..7fe6533c8295 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -3211,6 +3211,9 @@ mod tests { // Max 4-byte should not truncate assert!(increment_utf8("\u{10ffff}\u{10ffff}".as_bytes().to_vec()).is_none()); + + // Skip over surrogate pair range (0xD800..=0xDFFF) + test_inc("a\u{D7FF}", "a\u{e000}"); } #[test] From c4d947470e44b85f33182917c8dd90867604c185 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 12 Dec 2024 10:03:05 -0800 Subject: [PATCH 05/10] note case where string should render right to left --- parquet/src/column/writer/mod.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 23f548ac3b84..c46ef6be4454 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -3198,7 +3198,8 @@ mod tests { // Max 2-byte should not truncate as it would need 3-byte code points assert!(increment_utf8("\u{7ff}\u{7ff}".as_bytes().to_vec()).is_none()); - // 3-byte without overflow + // 3-byte without overflow [U+800, U+800] -> [U+800, U+801] (note that these + // characters should render right to left). test_inc("เ €เ €", "เ €เ "); // 3-byte ending in max 3-byte From 7a7fd0e0b08f029c6755277d581775d66388b9d4 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 13 Dec 2024 12:11:33 -0800 Subject: [PATCH 06/10] rework entirely, also avoid UTF8 processing if not required by the schema --- parquet/src/column/writer/mod.rs | 151 ++++++++++++++++++------------- 1 file changed, 86 insertions(+), 65 deletions(-) diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index c46ef6be4454..6a66700c36a9 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -878,13 +878,26 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { } } + /// Returns `true` if this column's logical type is a UTF-8 string. + fn is_utf8(&self) -> bool { + self.get_descriptor().logical_type() == Some(LogicalType::String) + || self.get_descriptor().converted_type() == ConvertedType::UTF8 + } + fn truncate_min_value(&self, truncation_length: Option, data: &[u8]) -> (Vec, bool) { truncation_length .filter(|l| data.len() > *l) - .and_then(|l| match str::from_utf8(data) { - Ok(str_data) => truncate_utf8(str_data, l), - Err(_) => Some(data[..l].to_vec()), - }) + .and_then(|l| + // don't do extra work if this column isn't UTF-8 + if self.is_utf8() { + match str::from_utf8(data) { + Ok(str_data) => truncate_utf8(str_data, l), + Err(_) => Some(data[..l].to_vec()), + } + } else { + Some(data[..l].to_vec()) + } + ) .map(|truncated| (truncated, true)) .unwrap_or_else(|| (data.to_vec(), false)) } @@ -892,10 +905,17 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { fn truncate_max_value(&self, truncation_length: Option, data: &[u8]) -> (Vec, bool) { truncation_length .filter(|l| data.len() > *l) - .and_then(|l| match str::from_utf8(data) { - Ok(str_data) => truncate_utf8(str_data, l).and_then(increment_utf8), - Err(_) => increment(data[..l].to_vec()), - }) + .and_then(|l| + // don't do extra work if this column isn't UTF-8 + if self.is_utf8() { + match str::from_utf8(data) { + Ok(str_data) => truncate_and_increment_utf8(str_data, l), + Err(_) => increment(data[..l].to_vec()), + } + } else { + increment(data[..l].to_vec()) + } + ) .map(|truncated| (truncated, true)) .unwrap_or_else(|| (data.to_vec(), false)) } @@ -1418,57 +1438,61 @@ fn compare_greater_byte_array_decimals(a: &[u8], b: &[u8]) -> bool { (a[1..]) > (b[1..]) } -/// Truncate a UTF8 slice to the longest prefix that is still a valid UTF8 string, -/// while being less than `length` bytes and non-empty +/// Truncate a UTF-8 slice to the longest prefix that is still a valid UTF-8 string, +/// while being less than `length` bytes and non-empty. Returns `None` if truncation +/// is not possible within those constraints. +/// +/// The caller guarantees that data.len() > length. fn truncate_utf8(data: &str, length: usize) -> Option> { let split = (1..=length).rfind(|x| data.is_char_boundary(*x))?; Some(data.as_bytes()[..split].to_vec()) } -/// Try and increment the bytes from right to left. +/// Truncate a UTF-8 slice and increment it's final character. The returned value is the +/// longest such slice that is still a valid UTF-8 string while being less than `length` +/// bytes and non-empty. Returns `None` if no such transformation is possible. /// -/// Returns `None` if all bytes are set to `u8::MAX`. -fn increment(mut data: Vec) -> Option> { - for byte in data.iter_mut().rev() { - let (incremented, overflow) = byte.overflowing_add(1); - *byte = incremented; +/// The caller guarantees that data.len() > length. +fn truncate_and_increment_utf8(data: &str, length: usize) -> Option> { + // UTF-8 is max 4 bytes, so start search 3 back from desired length + let lower_bound = length.saturating_sub(3); + let split = (lower_bound..=length).rfind(|x| data.is_char_boundary(*x))?; + increment_utf8(data.get(..split)?) +} - if !overflow { - return Some(data); +/// Increment the final character in a UTF-8 string in such a way that the returned result +/// is still a valid UTF-8 string. The returned string may be shorter than the input if the +/// last character(s) cannot be incremented (due to overflow or producing invalid code points). +/// Returns `None` if the string cannot be incremented. +/// +/// Note that this implementation will not promote an N-byte code point to (N+1) bytes. +fn increment_utf8(data: &str) -> Option> { + for (idx, code_point) in data.char_indices().rev() { + let curr_len = code_point.len_utf8(); + let original = code_point as u32; + if let Some(next_char) = char::from_u32(original + 1) { + // do not allow increasing byte width of incremented char + if next_char.len_utf8() == curr_len { + let mut result = data.as_bytes()[..idx + curr_len].to_vec(); + next_char.encode_utf8(&mut result[idx..]); + return Some(result); + } } } None } -/// Try and increment the the string's bytes from right to left, returning when the result -/// is a valid UTF8 string. Returns `None` when it can't increment any byte. -fn increment_utf8(mut data: Vec) -> Option> { - const UTF8_CONTINUATION: u8 = 0x80; - const UTF8_CONTINUATION_MASK: u8 = 0xc0; +/// Try and increment the bytes from right to left. +/// +/// Returns `None` if all bytes are set to `u8::MAX`. +fn increment(mut data: Vec) -> Option> { + for byte in data.iter_mut().rev() { + let (incremented, overflow) = byte.overflowing_add(1); + *byte = incremented; - let mut len = data.len(); - for idx in (0..data.len()).rev() { - let original = data[idx]; - let (byte, overflow) = original.overflowing_add(1); if !overflow { - data[idx] = byte; - if str::from_utf8(&data).is_ok() { - if len != data.len() { - data.truncate(len); - } - return Some(data); - } - // Incrementing "original" did not yield a valid unicode character, so it overflowed - // its available bits. If it was a continuation byte (b10xxxxxx) then set to min - // continuation (b10000000). Otherwise it was the first byte so set reset the first - // byte back to its original value (so data remains a valid string) and reduce "len". - if original & UTF8_CONTINUATION_MASK == UTF8_CONTINUATION { - data[idx] = UTF8_CONTINUATION; - } else { - data[idx] = original; - len = idx; - } + return Some(data); } } @@ -3158,7 +3182,7 @@ mod tests { #[test] fn test_increment_utf8() { let test_inc = |o: &str, expected: &str| { - if let Ok(v) = String::from_utf8(increment_utf8(o.as_bytes().to_vec()).unwrap()) { + if let Ok(v) = String::from_utf8(increment_utf8(o).unwrap()) { // Got the expected result... assert_eq!(v, expected); // and it's greater than the original string @@ -3181,7 +3205,7 @@ mod tests { test_inc("a\u{7f}", "b"); // 1-byte max should not truncate as it would need 2-byte code points - assert!(increment_utf8("\u{7f}\u{7f}".as_bytes().to_vec()).is_none()); + assert!(increment_utf8("\u{7f}\u{7f}").is_none()); // UTF8 string test_inc("โค๏ธ๐Ÿงก๐Ÿ’›๐Ÿ’š๐Ÿ’™๐Ÿ’œ", "โค๏ธ๐Ÿงก๐Ÿ’›๐Ÿ’š๐Ÿ’™๐Ÿ’"); @@ -3196,7 +3220,7 @@ mod tests { test_inc("a\u{7ff}", "b"); // Max 2-byte should not truncate as it would need 3-byte code points - assert!(increment_utf8("\u{7ff}\u{7ff}".as_bytes().to_vec()).is_none()); + assert!(increment_utf8("\u{7ff}\u{7ff}").is_none()); // 3-byte without overflow [U+800, U+800] -> [U+800, U+801] (note that these // characters should render right to left). @@ -3206,7 +3230,7 @@ mod tests { test_inc("a\u{ffff}", "b"); // Max 3-byte should not truncate as it would need 4-byte code points - assert!(increment_utf8("\u{ffff}\u{ffff}".as_bytes().to_vec()).is_none()); + assert!(increment_utf8("\u{ffff}\u{ffff}").is_none()); // 4-byte without overflow test_inc("๐€€๐€€", "๐€€๐€"); @@ -3215,10 +3239,11 @@ mod tests { test_inc("a\u{10ffff}", "b"); // Max 4-byte should not truncate - assert!(increment_utf8("\u{10ffff}\u{10ffff}".as_bytes().to_vec()).is_none()); + assert!(increment_utf8("\u{10ffff}\u{10ffff}").is_none()); // Skip over surrogate pair range (0xD800..=0xDFFF) - test_inc("a\u{D7FF}", "a\u{e000}"); + //test_inc("a\u{D7FF}", "a\u{e000}"); + test_inc("a\u{D7FF}", "b"); } #[test] @@ -3238,42 +3263,38 @@ mod tests { let r = truncate_utf8("\u{0836}", 1); assert!(r.is_none()); - // test truncate and increment for max bounds on utf-8 stats + // Test truncate and increment for max bounds on UTF-8 statistics // 7-bit (i.e. ASCII) - let r = truncate_utf8("yyyyyyyyy", 8) - .and_then(increment_utf8) - .unwrap(); + let r = truncate_and_increment_utf8("yyyyyyyyy", 8).unwrap(); assert_eq!(&r, "yyyyyyyz".as_bytes()); // 2-byte without overflow - let r = truncate_utf8("รฉรฉรฉรฉรฉ", 8).and_then(increment_utf8).unwrap(); + let r = truncate_and_increment_utf8("รฉรฉรฉรฉรฉ", 8).unwrap(); assert_eq!(&r, "รฉรฉรฉรช".as_bytes()); // 2-byte that overflows lowest byte - let r = truncate_utf8("\u{ff}\u{ff}\u{ff}\u{ff}\u{ff}", 8) - .and_then(increment_utf8) - .unwrap(); + let r = truncate_and_increment_utf8("\u{ff}\u{ff}\u{ff}\u{ff}\u{ff}", 8).unwrap(); assert_eq!(&r, "\u{ff}\u{ff}\u{ff}\u{100}".as_bytes()); // max 2-byte should not truncate as it would need 3-byte code points - let r = truncate_utf8("฿ฟ฿ฟ฿ฟ฿ฟ฿ฟ", 8).and_then(increment_utf8); + let r = truncate_and_increment_utf8("฿ฟ฿ฟ฿ฟ฿ฟ฿ฟ", 8); assert!(r.is_none()); - // 3-byte without overflow - let r = truncate_utf8("เ €เ €เ €", 8).and_then(increment_utf8).unwrap(); + // 3-byte without overflow [U+800, U+800, U+800] -> [U+800, U+801] (note that these + // characters should render right to left). + let r = truncate_and_increment_utf8("เ €เ €เ €", 8).unwrap(); assert_eq!(&r, "เ €เ ".as_bytes()); - assert_eq!(r.len(), 6); // max 3-byte should not truncate as it would need 4-byte code points - let r = truncate_utf8("\u{ffff}\u{ffff}\u{ffff}", 8).and_then(increment_utf8); + let r = truncate_and_increment_utf8("\u{ffff}\u{ffff}\u{ffff}", 8); assert!(r.is_none()); // 4-byte without overflow - let r = truncate_utf8("๐€€๐€€๐€€", 8).and_then(increment_utf8).unwrap(); + let r = truncate_and_increment_utf8("๐€€๐€€๐€€", 8).unwrap(); assert_eq!(&r, "๐€€๐€".as_bytes()); // max 4-byte should not truncate - let r = truncate_utf8("\u{10ffff}\u{10ffff}", 8).and_then(increment_utf8); + let r = truncate_and_increment_utf8("\u{10ffff}\u{10ffff}", 8); assert!(r.is_none()); } From 400f5f8baed2ae9f477dbb1b67bb4349bcd00232 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Sat, 14 Dec 2024 09:04:20 -0800 Subject: [PATCH 07/10] more consistent naming --- parquet/src/column/writer/mod.rs | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 6a66700c36a9..2f33698f9d44 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -1467,13 +1467,12 @@ fn truncate_and_increment_utf8(data: &str, length: usize) -> Option> { /// /// Note that this implementation will not promote an N-byte code point to (N+1) bytes. fn increment_utf8(data: &str) -> Option> { - for (idx, code_point) in data.char_indices().rev() { - let curr_len = code_point.len_utf8(); - let original = code_point as u32; - if let Some(next_char) = char::from_u32(original + 1) { + for (idx, original_char) in data.char_indices().rev() { + let original_len = original_char.len_utf8(); + if let Some(next_char) = char::from_u32(original_char as u32 + 1) { // do not allow increasing byte width of incremented char - if next_char.len_utf8() == curr_len { - let mut result = data.as_bytes()[..idx + curr_len].to_vec(); + if next_char.len_utf8() == original_len { + let mut result = data.as_bytes()[..idx + original_len].to_vec(); next_char.encode_utf8(&mut result[idx..]); return Some(result); } From 006a388b424876c1d7bb5f315bc81c142b736df2 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Sat, 14 Dec 2024 09:23:17 -0800 Subject: [PATCH 08/10] modify some tests to truncate in the middle of a multibyte char --- parquet/src/column/writer/mod.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 2f33698f9d44..80e1ac2b10e2 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -3268,8 +3268,8 @@ mod tests { assert_eq!(&r, "yyyyyyyz".as_bytes()); // 2-byte without overflow - let r = truncate_and_increment_utf8("รฉรฉรฉรฉรฉ", 8).unwrap(); - assert_eq!(&r, "รฉรฉรฉรช".as_bytes()); + let r = truncate_and_increment_utf8("รฉรฉรฉรฉรฉ", 7).unwrap(); + assert_eq!(&r, "รฉรฉรช".as_bytes()); // 2-byte that overflows lowest byte let r = truncate_and_increment_utf8("\u{ff}\u{ff}\u{ff}\u{ff}\u{ff}", 8).unwrap(); @@ -3281,7 +3281,7 @@ mod tests { // 3-byte without overflow [U+800, U+800, U+800] -> [U+800, U+801] (note that these // characters should render right to left). - let r = truncate_and_increment_utf8("เ €เ €เ €", 8).unwrap(); + let r = truncate_and_increment_utf8("เ €เ €เ €เ €", 8).unwrap(); assert_eq!(&r, "เ €เ ".as_bytes()); // max 3-byte should not truncate as it would need 4-byte code points @@ -3289,7 +3289,7 @@ mod tests { assert!(r.is_none()); // 4-byte without overflow - let r = truncate_and_increment_utf8("๐€€๐€€๐€€", 8).unwrap(); + let r = truncate_and_increment_utf8("๐€€๐€€๐€€๐€€", 9).unwrap(); assert_eq!(&r, "๐€€๐€".as_bytes()); // max 4-byte should not truncate From f251b00a1dd9705fa4a28547e7f8bbdea8ebb06a Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 16 Dec 2024 09:51:54 -0800 Subject: [PATCH 09/10] add test and docstring --- parquet/src/column/writer/mod.rs | 70 +++++++++++++++++++++++++++++++- 1 file changed, 68 insertions(+), 2 deletions(-) diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 80e1ac2b10e2..8188de0b54f0 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -902,6 +902,19 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { .unwrap_or_else(|| (data.to_vec(), false)) } + /// Truncates a binary statistic to at most `truncation_length` bytes, and then increment the + /// final byte(s) to yield a valid upper bound. This may result in a result of less than + /// `truncation_length` bytes if the last byte(s) overflows. + /// + /// If truncation is not possible, returns `data`. + /// + /// The `bool` in the returned tuple indicates whether truncation occurred or not. + /// + /// UTF-8 Note: + /// If the column type indicates UTF-8, and `data` contains valid UTF-8, then the result will + /// also remain valid UTF-8 (but again may be less than `truncation_length` bytes). If `data` + /// does not contain valid UTF-8, then truncation will occur as if the column is non-string + /// binary. fn truncate_max_value(&self, truncation_length: Option, data: &[u8]) -> (Vec, bool) { truncation_length .filter(|l| data.len() > *l) @@ -1500,10 +1513,13 @@ fn increment(mut data: Vec) -> Option> { #[cfg(test)] mod tests { - use crate::file::properties::DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH; + use crate::{ + file::{properties::DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH, writer::SerializedFileWriter}, + schema::parser::parse_message_type, + }; use core::str; use rand::distributions::uniform::SampleUniform; - use std::sync::Arc; + use std::{fs::File, sync::Arc}; use crate::column::{ page::PageReader, @@ -3297,6 +3313,56 @@ mod tests { assert!(r.is_none()); } + #[test] + // Check fallback truncation of statistics that should be UTF-8, but aren't + // (see https://github.com/apache/arrow-rs/pull/6870). + fn test_byte_array_truncate_invalid_utf8_statistics() { + let message_type = " + message test_schema { + OPTIONAL BYTE_ARRAY a (UTF8); + } + "; + let schema = Arc::new(parse_message_type(message_type).unwrap()); + + // Create Vec containing non-UTF8 bytes + let data = vec![ByteArray::from(vec![128u8; 32]); 7]; + let def_levels = [1, 1, 1, 1, 0, 1, 0, 1, 0, 1]; + let file: File = tempfile::tempfile().unwrap(); + let props = Arc::new( + WriterProperties::builder() + .set_statistics_enabled(EnabledStatistics::Chunk) + .set_statistics_truncate_length(Some(8)) + .build(), + ); + + let mut writer = SerializedFileWriter::new(&file, schema, props).unwrap(); + let mut row_group_writer = writer.next_row_group().unwrap(); + + let mut col_writer = row_group_writer.next_column().unwrap().unwrap(); + col_writer + .typed::() + .write_batch(&data, Some(&def_levels), None) + .unwrap(); + col_writer.close().unwrap(); + row_group_writer.close().unwrap(); + let file_metadata = writer.close().unwrap(); + assert!(file_metadata.row_groups[0].columns[0].meta_data.is_some()); + let stats = file_metadata.row_groups[0].columns[0] + .meta_data + .as_ref() + .unwrap() + .statistics + .as_ref() + .unwrap(); + assert!(!stats.is_max_value_exact.unwrap()); + // Truncation of invalid UTF-8 should fall back to binary truncation, so last byte should + // be incremented by 1. + assert_eq!( + stats.max_value, + Some([128, 128, 128, 128, 128, 128, 128, 129].to_vec()) + ); + } + #[test] fn test_increment_max_binary_chars() { let r = increment(vec![0xFF, 0xFE, 0xFD, 0xFF, 0xFF]); From e7d0af8b200d94bcdbafb5811373c38b46be6354 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 16 Dec 2024 10:00:41 -0800 Subject: [PATCH 10/10] document truncate_min_value too --- parquet/src/column/writer/mod.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 8188de0b54f0..8dc1d0db4476 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -884,6 +884,16 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { || self.get_descriptor().converted_type() == ConvertedType::UTF8 } + /// Truncates a binary statistic to at most `truncation_length` bytes. + /// + /// If truncation is not possible, returns `data`. + /// + /// The `bool` in the returned tuple indicates whether truncation occurred or not. + /// + /// UTF-8 Note: + /// If the column type indicates UTF-8, and `data` contains valid UTF-8, then the result will + /// also remain valid UTF-8, but may be less tnan `truncation_length` bytes to avoid splitting + /// on non-character boundaries. fn truncate_min_value(&self, truncation_length: Option, data: &[u8]) -> (Vec, bool) { truncation_length .filter(|l| data.len() > *l)