diff --git a/Cargo.lock b/Cargo.lock index 7792ef809b9b..6824739e3d92 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1573,6 +1573,7 @@ dependencies = [ "common-functions", "common-legacy-expression", "sqlparser", + "unicode-segmentation", ] [[package]] diff --git a/src/query/legacy-parser/Cargo.toml b/src/query/legacy-parser/Cargo.toml index 149ca678a986..a0c941825179 100644 --- a/src/query/legacy-parser/Cargo.toml +++ b/src/query/legacy-parser/Cargo.toml @@ -16,3 +16,4 @@ common-legacy-expression = { path = "../legacy-expression" } async-trait = "0.1.57" sqlparser = { git = "https://github.com/datafuse-extras/sqlparser-rs", rev = "7f246e3" } +unicode-segmentation = "^1.2" diff --git a/src/query/legacy-parser/src/sql_common.rs b/src/query/legacy-parser/src/sql_common.rs index f80f8238387f..be74596143ff 100644 --- a/src/query/legacy-parser/src/sql_common.rs +++ b/src/query/legacy-parser/src/sql_common.rs @@ -18,6 +18,7 @@ use common_datavalues::prelude::*; use common_exception::ErrorCode; use common_exception::Result; use sqlparser::ast::DataType as SQLDataType; +use unicode_segmentation::UnicodeSegmentation; pub struct SQLCommon; @@ -126,9 +127,51 @@ impl SQLCommon { pub fn short_sql(query: &str) -> String { let query = query.trim_start(); if query.len() >= 64 && query[..6].eq_ignore_ascii_case("INSERT") { - format!("{}...", &query[..64]) + // keep first 64 graphemes + String::from_utf8( + query + .graphemes(true) + .take(64) + .flat_map(|g| g.as_bytes().iter()) + .copied() // copied converts &u8 into u8 + .chain(b"...".iter().copied()) + .collect::>(), + ) + .unwrap() // by construction, this cannot panic as we extracted unicode grapheme } else { query.to_string() } } } + +#[cfg(test)] +mod test { + use crate::sql_common::SQLCommon; + + const LONG_INSERT_WITH_UNICODE_AT_TRUNCATION_POINT: &str = + "INSERT INTO `test` VALUES ('abcd', 'def'),('abcd', 'def'),('abcé', 'def');"; + + #[test] + #[should_panic] + fn test_invalid_string_truncation() { + // This test checks the INSERT statement did panic with byte truncated string. + // We need to do this to validate that the code of short_sql has fixed this panic! + format!("{}...", &LONG_INSERT_WITH_UNICODE_AT_TRUNCATION_POINT[..64]); + } + + #[test] + fn test_short_sql_truncation_on_unicode() { + // short insert into statements are not truncated + assert_eq!( + SQLCommon::short_sql("INSERT INTO `test` VALUES('abcd', 'def');"), + "INSERT INTO `test` VALUES('abcd', 'def');" + ); + // long one are at 64th char... + let shortned = SQLCommon::short_sql(LONG_INSERT_WITH_UNICODE_AT_TRUNCATION_POINT); + assert_eq!(shortned.len(), 68); // 64 chars with a multibyte one (é) + ... + assert_eq!( + shortned, + "INSERT INTO `test` VALUES ('abcd', 'def'),('abcd', 'def'),('abcé..." + ); + } +}