From bc15f7b4ceab849a974e84fcd38bde353cb7c2d1 Mon Sep 17 00:00:00 2001 From: Ophir LOJKINE Date: Mon, 29 Jul 2024 23:18:16 +0200 Subject: [PATCH] Support for postgres String Constants with Unicode Escapes (#1355) --- src/ast/value.rs | 40 +++++++++++++++++++ src/dialect/generic.rs | 4 ++ src/dialect/mod.rs | 15 +++++++ src/dialect/postgresql.rs | 4 ++ src/parser/mod.rs | 7 ++++ src/tokenizer.rs | 78 +++++++++++++++++++++++++++++++++++++ tests/sqlparser_postgres.rs | 32 +++++++++++++++ 7 files changed, 180 insertions(+) diff --git a/src/ast/value.rs b/src/ast/value.rs index 4c1a56a92..17cdb839d 100644 --- a/src/ast/value.rs +++ b/src/ast/value.rs @@ -52,6 +52,10 @@ pub enum Value { /// See [Postgres docs](https://www.postgresql.org/docs/8.3/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS) /// for more details. EscapedStringLiteral(String), + /// u&'string value' (postgres extension) + /// See [Postgres docs](https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE) + /// for more details. 
+ UnicodeStringLiteral(String), /// B'string value' SingleQuotedByteStringLiteral(String), /// B"string value" @@ -102,6 +106,7 @@ impl fmt::Display for Value { } Value::DollarQuotedString(v) => write!(f, "{v}"), Value::EscapedStringLiteral(v) => write!(f, "E'{}'", escape_escaped_string(v)), + Value::UnicodeStringLiteral(v) => write!(f, "U&'{}'", escape_unicode_string(v)), Value::NationalStringLiteral(v) => write!(f, "N'{v}'"), Value::HexStringLiteral(v) => write!(f, "X'{v}'"), Value::Boolean(v) => write!(f, "{v}"), @@ -347,6 +352,41 @@ pub fn escape_escaped_string(s: &str) -> EscapeEscapedStringLiteral<'_> { EscapeEscapedStringLiteral(s) } +pub struct EscapeUnicodeStringLiteral<'a>(&'a str); + +impl<'a> fmt::Display for EscapeUnicodeStringLiteral<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + for c in self.0.chars() { + match c { + '\'' => { + write!(f, "''")?; + } + '\\' => { + write!(f, r#"\\"#)?; + } + x if x.is_ascii() => { + write!(f, "{}", c)?; + } + _ => { + let codepoint = c as u32; + // if the code point fits in 16 bits (<= 0xFFFF), we can use the \XXXX format + // otherwise, we need to use the \+XXXXXX format + if codepoint <= 0xFFFF { + write!(f, "\\{:04X}", codepoint)?; + } else { + write!(f, "\\+{:06X}", codepoint)?; + } + } + } + } + Ok(()) + } +} + +pub fn escape_unicode_string(s: &str) -> EscapeUnicodeStringLiteral<'_> { + EscapeUnicodeStringLiteral(s) +} + #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[cfg_attr(feature = "visitor", derive(Visit, VisitMut))] diff --git a/src/dialect/generic.rs b/src/dialect/generic.rs index 8d762d780..2777dfb02 100644 --- a/src/dialect/generic.rs +++ b/src/dialect/generic.rs @@ -35,6 +35,10 @@ impl Dialect for GenericDialect { || ch == '_' } + fn supports_unicode_string_literal(&self) -> bool { + true + } + fn supports_group_by_expr(&self) -> bool { true } diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs index 
3ff7bb2a5..22e0baeb2 100644 --- a/src/dialect/mod.rs +++ b/src/dialect/mod.rs @@ -145,6 +145,21 @@ pub trait Dialect: Debug + Any { fn supports_string_literal_backslash_escape(&self) -> bool { false } + + /// Determine if the dialect supports string literals with `U&` prefix. + /// This is used to specify Unicode code points in string literals. + /// For example, in PostgreSQL, the following is a valid string literal: + /// ```sql + /// SELECT U&'\0061\0062\0063'; + /// ``` + /// This is equivalent to the string literal `'abc'`. + /// See + /// - [Postgres docs](https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE) + /// - [H2 docs](http://www.h2database.com/html/grammar.html#string) + fn supports_unicode_string_literal(&self) -> bool { + false + } + /// Does the dialect support `FILTER (WHERE expr)` for aggregate queries? fn supports_filter_during_aggregation(&self) -> bool { false diff --git a/src/dialect/postgresql.rs b/src/dialect/postgresql.rs index 0e04bfa27..8254e807b 100644 --- a/src/dialect/postgresql.rs +++ b/src/dialect/postgresql.rs @@ -40,6 +40,10 @@ impl Dialect for PostgreSqlDialect { ch.is_alphabetic() || ch.is_ascii_digit() || ch == '$' || ch == '_' } + fn supports_unicode_string_literal(&self) -> bool { + true + } + /// See fn is_custom_operator_part(&self, ch: char) -> bool { matches!( diff --git a/src/parser/mod.rs b/src/parser/mod.rs index b3120bb30..2b1c1ab7f 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -1191,6 +1191,10 @@ impl<'a> Parser<'a> { self.prev_token(); Ok(Expr::Value(self.parse_value()?)) } + Token::UnicodeStringLiteral(_) => { + self.prev_token(); + Ok(Expr::Value(self.parse_value()?)) + } Token::Number(_, _) | Token::SingleQuotedString(_) | Token::DoubleQuotedString(_) @@ -1868,6 +1872,7 @@ impl<'a> Parser<'a> { } Token::SingleQuotedString(_) | Token::EscapedStringLiteral(_) + | Token::UnicodeStringLiteral(_) | Token::NationalStringLiteral(_) | Token::HexStringLiteral(_) => 
Some(Box::new(self.parse_expr()?)), _ => self.expected( @@ -6965,6 +6970,7 @@ impl<'a> Parser<'a> { } Token::NationalStringLiteral(ref s) => Ok(Value::NationalStringLiteral(s.to_string())), Token::EscapedStringLiteral(ref s) => Ok(Value::EscapedStringLiteral(s.to_string())), + Token::UnicodeStringLiteral(ref s) => Ok(Value::UnicodeStringLiteral(s.to_string())), Token::HexStringLiteral(ref s) => Ok(Value::HexStringLiteral(s.to_string())), Token::Placeholder(ref s) => Ok(Value::Placeholder(s.to_string())), tok @ Token::Colon | tok @ Token::AtSign => { @@ -7056,6 +7062,7 @@ impl<'a> Parser<'a> { Token::EscapedStringLiteral(s) if dialect_of!(self is PostgreSqlDialect | GenericDialect) => { Ok(s) } + Token::UnicodeStringLiteral(s) => Ok(s), _ => self.expected("literal string", next_token), } } diff --git a/src/tokenizer.rs b/src/tokenizer.rs index b8336cec8..be11a3140 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -94,6 +94,8 @@ pub enum Token { NationalStringLiteral(String), /// "escaped" string literal, which are an extension to the SQL standard: i.e: e'first \n second' or E 'first \n second' EscapedStringLiteral(String), + /// Unicode string literal: i.e: U&'first \000A second' + UnicodeStringLiteral(String), /// Hexadecimal string literal: i.e.: X'deadbeef' HexStringLiteral(String), /// Comma @@ -251,6 +253,7 @@ impl fmt::Display for Token { Token::DollarQuotedString(ref s) => write!(f, "{s}"), Token::NationalStringLiteral(ref s) => write!(f, "N'{s}'"), Token::EscapedStringLiteral(ref s) => write!(f, "E'{s}'"), + Token::UnicodeStringLiteral(ref s) => write!(f, "U&'{s}'"), Token::HexStringLiteral(ref s) => write!(f, "X'{s}'"), Token::SingleQuotedByteStringLiteral(ref s) => write!(f, "B'{s}'"), Token::TripleSingleQuotedByteStringLiteral(ref s) => write!(f, "B'''{s}'''"), @@ -794,6 +797,23 @@ impl<'a> Tokenizer<'a> { } } } + // Unicode string literals like U&'first \000A second' are supported in some dialects, including PostgreSQL + x @ 'u' | x @ 'U' if 
self.dialect.supports_unicode_string_literal() => { + chars.next(); // consume, to check the next char + if chars.peek() == Some(&'&') { + // we cannot advance the iterator here, as we need to consume the '&' later if the 'u' was an identifier + let mut chars_clone = chars.peekable.clone(); + chars_clone.next(); // consume the '&' in the clone + if chars_clone.peek() == Some(&'\'') { + chars.next(); // consume the '&' in the original iterator + let s = unescape_unicode_single_quoted_string(chars)?; + return Ok(Some(Token::UnicodeStringLiteral(s))); + } + } + // regular identifier starting with an "U" or "u" + let s = self.tokenize_word(x, chars); + Ok(Some(Token::make_word(&s, None))) + } // The spec only allows an uppercase 'X' to introduce a hex // string, but PostgreSQL, at least, allows a lowercase 'x' too. x @ 'x' | x @ 'X' => { @@ -1797,6 +1817,64 @@ impl<'a: 'b, 'b> Unescape<'a, 'b> { } } +fn unescape_unicode_single_quoted_string(chars: &mut State<'_>) -> Result<String, TokenizerError> { + let mut unescaped = String::new(); + chars.next(); // consume the opening quote + while let Some(c) = chars.next() { + match c { + '\'' => { + if chars.peek() == Some(&'\'') { + chars.next(); + unescaped.push('\''); + } else { + return Ok(unescaped); + } + } + '\\' => match chars.peek() { + Some('\\') => { + chars.next(); + unescaped.push('\\'); + } + Some('+') => { + chars.next(); + unescaped.push(take_char_from_hex_digits(chars, 6)?); + } + _ => unescaped.push(take_char_from_hex_digits(chars, 4)?), + }, + _ => { + unescaped.push(c); + } + } + } + Err(TokenizerError { + message: "Unterminated unicode encoded string literal".to_string(), + location: chars.location(), + }) +} + +fn take_char_from_hex_digits( + chars: &mut State<'_>, + max_digits: usize, +) -> Result<char, TokenizerError> { + let mut result = 0u32; + for _ in 0..max_digits { + let next_char = chars.next().ok_or_else(|| TokenizerError { + message: "Unexpected EOF while parsing hex digit in escaped unicode string." 
+ .to_string(), + location: chars.location(), + })?; + let digit = next_char.to_digit(16).ok_or_else(|| TokenizerError { + message: format!("Invalid hex digit in escaped unicode string: {}", next_char), + location: chars.location(), + })?; + result = result * 16 + digit; + } + char::from_u32(result).ok_or_else(|| TokenizerError { + message: format!("Invalid unicode character: {:x}", result), + location: chars.location(), + }) +} + #[cfg(test)] mod tests { use super::*; diff --git a/tests/sqlparser_postgres.rs b/tests/sqlparser_postgres.rs index 5ac421da0..44231e7d3 100644 --- a/tests/sqlparser_postgres.rs +++ b/tests/sqlparser_postgres.rs @@ -4441,3 +4441,35 @@ fn test_table_unnest_with_ordinality() { _ => panic!("Expecting TableFactor::UNNEST with ordinality"), } } + +#[test] +fn test_escaped_string_literal() { + match pg().verified_expr(r#"E'\n'"#) { + Expr::Value(Value::EscapedStringLiteral(s)) => { + assert_eq!("\n", s); + } + _ => unreachable!(), + } +} + +#[test] +fn test_unicode_string_literal() { + let pairs = [ + // Example from the postgres docs + (r#"U&'\0441\043B\043E\043D'"#, "слон"), + // High unicode code point (> 0xFFFF) + (r#"U&'\+01F418'"#, "🐘"), + // Escaped backslash + (r#"U&'\\'"#, r#"\"#), + // Escaped single quote + (r#"U&''''"#, "'"), + ]; + for (input, expected) in pairs { + match pg_and_generic().verified_expr(input) { + Expr::Value(Value::UnicodeStringLiteral(s)) => { + assert_eq!(expected, s); + } + _ => unreachable!(), + } + } +}