Skip to content

Commit

Permalink
Merge pull request #8011 from Scoopit/right-way-to-shorten-a-string
Browse files Browse the repository at this point in the history
fix: Use unicode_segmentation to truncate INSERT statement
  • Loading branch information
Xuanwo authored Oct 2, 2022
2 parents b8f1a29 + 0c57f56 commit edcc1bf
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 1 deletion.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions src/query/legacy-parser/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ common-legacy-expression = { path = "../legacy-expression" }

async-trait = "0.1.57"
sqlparser = { git = "https://github.com/datafuse-extras/sqlparser-rs", rev = "7f246e3" }
unicode-segmentation = "^1.2"
45 changes: 44 additions & 1 deletion src/query/legacy-parser/src/sql_common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ use common_datavalues::prelude::*;
use common_exception::ErrorCode;
use common_exception::Result;
use sqlparser::ast::DataType as SQLDataType;
use unicode_segmentation::UnicodeSegmentation;

pub struct SQLCommon;

Expand Down Expand Up @@ -126,9 +127,51 @@ impl SQLCommon {
pub fn short_sql(query: &str) -> String {
let query = query.trim_start();
if query.len() >= 64 && query[..6].eq_ignore_ascii_case("INSERT") {
format!("{}...", &query[..64])
// keep first 64 graphemes
String::from_utf8(
query
.graphemes(true)
.take(64)
.flat_map(|g| g.as_bytes().iter())
.copied() // copied converts &u8 into u8
.chain(b"...".iter().copied())
.collect::<Vec<u8>>(),
)
.unwrap() // by construction, this cannot panic as we extracted unicode grapheme
} else {
query.to_string()
}
}
}

#[cfg(test)]
mod test {
    use crate::sql_common::SQLCommon;

    /// An INSERT statement longer than 64 bytes whose byte offset 64 falls in
    /// the middle of the two-byte character `é`.
    const INSERT_WITH_MULTIBYTE_AT_CUT: &str =
        "INSERT INTO `test` VALUES ('abcd', 'def'),('abcd', 'def'),('abcé', 'def');";

    /// Demonstrates that naive byte slicing of the fixture panics, which is
    /// the failure `short_sql` has to avoid.
    #[test]
    #[should_panic]
    fn test_invalid_string_truncation() {
        // Slicing at byte 64 lands inside `é` and therefore panics.
        let truncated = &INSERT_WITH_MULTIBYTE_AT_CUT[..64];
        let _ = format!("{}...", truncated);
    }

    /// Checks that `short_sql` leaves short statements alone and truncates
    /// long ones without splitting a multi-byte character.
    #[test]
    fn test_short_sql_truncation_on_unicode() {
        // A short INSERT statement comes back unchanged.
        let short_statement = "INSERT INTO `test` VALUES('abcd', 'def');";
        assert_eq!(SQLCommon::short_sql(short_statement), short_statement);

        // A long one is cut after its 64th character and suffixed with "...".
        let shortened = SQLCommon::short_sql(INSERT_WITH_MULTIBYTE_AT_CUT);
        // 63 single-byte chars + `é` (2 bytes) + "..." (3 bytes) = 68 bytes.
        assert_eq!(shortened.len(), 68);
        assert_eq!(
            shortened,
            "INSERT INTO `test` VALUES ('abcd', 'def'),('abcd', 'def'),('abcé..."
        );
    }
}

1 comment on commit edcc1bf

@vercel
Copy link

@vercel vercel bot commented on edcc1bf Oct 2, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Successfully deployed to the following URLs:

databend – ./

databend-databend.vercel.app
databend-git-main-databend.vercel.app
databend.vercel.app
databend.rs

Please sign in to comment.