From b7f265a4590049f8274cef8411f63eddb5b4bf87 Mon Sep 17 00:00:00 2001 From: waralexrom <108349432+waralexrom@users.noreply.github.com> Date: Wed, 14 Feb 2024 20:09:29 +0300 Subject: [PATCH] feat: Strings with Unicode Escapes (#43) --- src/ast/value.rs | 3 +++ src/parser.rs | 6 ++++- src/tokenizer.rs | 57 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 65 insertions(+), 1 deletion(-) diff --git a/src/ast/value.rs b/src/ast/value.rs index c7b4817c3..4290c1f55 100644 --- a/src/ast/value.rs +++ b/src/ast/value.rs @@ -42,6 +42,8 @@ pub enum Value { NationalStringLiteral(String), /// X'hex value' HexStringLiteral(String), + /// Unicode-escaped string literal: U&'d\0061t\+000061' (data) + UnicodeEscapedStringLiteral(String), DoubleQuotedString(String), /// Boolean value true or false @@ -78,6 +80,7 @@ impl fmt::Display for Value { Value::DoubleQuotedString(v) => write!(f, "\"{}\"", v), Value::SingleQuotedString(v) => write!(f, "'{}'", escape_single_quote_string(v)), Value::EscapedStringLiteral(v) => write!(f, "E'{}'", escape_escaped_string(v)), + Value::UnicodeEscapedStringLiteral(v) => write!(f, "U&'{}'", escape_escaped_string(v)), Value::NationalStringLiteral(v) => write!(f, "N'{}'", v), Value::HexStringLiteral(v) => write!(f, "X'{}'", v), Value::Boolean(v) => write!(f, "{}", v), diff --git a/src/parser.rs b/src/parser.rs index b9860748c..e8c9822dc 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -539,7 +539,7 @@ impl<'a> Parser<'a> { expr: Box::new(self.parse_subexpr(Self::PLUS_MINUS_PREC)?), }) } - Token::EscapedStringLiteral(_) if dialect_of!(self is PostgreSqlDialect | GenericDialect) => + Token::EscapedStringLiteral(_) | Token::UnicodeEscapedStringLiteral(_) if dialect_of!(self is PostgreSqlDialect | GenericDialect) => { self.prev_token(); Ok(Expr::Value(self.parse_value()?)) @@ -971,6 +971,7 @@ impl<'a> Parser<'a> { Token::SingleQuotedString(_) | Token::EscapedStringLiteral(_) | Token::NationalStringLiteral(_) + | Token::UnicodeEscapedStringLiteral(_) | Token::HexStringLiteral(_) => 
Some(Box::new(self.parse_expr()?)), unexpected => { self.expected("either filler, WITH, or WITHOUT in LISTAGG", unexpected)? @@ -2893,6 +2894,9 @@ impl<'a> Parser<'a> { Token::SingleQuotedString(ref s) => Ok(Value::SingleQuotedString(s.to_string())), Token::NationalStringLiteral(ref s) => Ok(Value::NationalStringLiteral(s.to_string())), Token::EscapedStringLiteral(ref s) => Ok(Value::EscapedStringLiteral(s.to_string())), + Token::UnicodeEscapedStringLiteral(ref s) => { + Ok(Value::UnicodeEscapedStringLiteral(s.to_string())) + } Token::HexStringLiteral(ref s) => Ok(Value::HexStringLiteral(s.to_string())), Token::Placeholder(ref s) => Ok(Value::Placeholder(s.to_string())), unexpected => self.expected("a value", unexpected), diff --git a/src/tokenizer.rs b/src/tokenizer.rs index f9a4e40cf..91fabd628 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -55,6 +55,8 @@ pub enum Token { EscapedStringLiteral(String), /// Hexadecimal string literal: i.e.: X'deadbeef' HexStringLiteral(String), + /// Unicode escaped string: U&'d\0061t\+000061' (data) + UnicodeEscapedStringLiteral(String), /// Comma Comma, /// Whitespace (space, tab, etc) @@ -156,6 +158,7 @@ impl fmt::Display for Token { Token::NationalStringLiteral(ref s) => write!(f, "N'{}'", s), Token::EscapedStringLiteral(ref s) => write!(f, "E'{}'", s), Token::HexStringLiteral(ref s) => write!(f, "X'{}'", s), + Token::UnicodeEscapedStringLiteral(ref s) => write!(f, "U&'{}'", s), Token::Comma => f.write_str(","), Token::Whitespace(ws) => write!(f, "{}", ws), Token::DoubleEq => f.write_str("=="), @@ -415,6 +418,28 @@ impl<'a> Tokenizer<'a> { } } } + x @ 'u' | x @ 'U' => { + chars.next(); // consume, to check the next char + let mut look_ahead_chars = chars.clone(); + if look_ahead_chars.next_if_eq(&'&').is_some() { + match look_ahead_chars.peek() { + Some('\'') => { + //Move chars to the position of look_ahead_chars + chars.next(); + // U&'...' 
- a + let s = self.tokenize_single_quoted_string(chars)?; + Ok(Some(Token::UnicodeEscapedStringLiteral(s))) + } + _ => { + let s = self.tokenize_word(x, chars); + Ok(Some(Token::make_word(&s, None))) + } + } + } else { + let s = self.tokenize_word(x, chars); + Ok(Some(Token::make_word(&s, None))) + } + } // identifier or keyword ch if self.dialect.is_identifier_start(ch) => { chars.next(); // consume the first char @@ -1417,4 +1442,36 @@ mod tests { //println!("------------------------------"); assert_eq!(expected, actual); } + #[test] + fn tokenize_unicode_escaped_literal() { + let sql = r#"U&'aaa'"#; + let dialect = GenericDialect {}; + let mut tokenizer = Tokenizer::new(&dialect, sql); + let tokens = tokenizer.tokenize().unwrap(); + let expected = vec![Token::UnicodeEscapedStringLiteral("aaa".to_string())]; + compare(expected, tokens); + + let sql = r#"U&a"#; + let dialect = GenericDialect {}; + let mut tokenizer = Tokenizer::new(&dialect, sql); + let tokens = tokenizer.tokenize().unwrap(); + let expected = vec![ + Token::make_word("U", None), + Token::Ampersand, + Token::make_word("a", None), + ]; + compare(expected, tokens); + let sql = r#"U & 'aaa'"#; + let dialect = GenericDialect {}; + let mut tokenizer = Tokenizer::new(&dialect, sql); + let tokens = tokenizer.tokenize().unwrap(); + let expected = vec![ + Token::make_word("U", None), + Token::Whitespace(Whitespace::Space), + Token::Ampersand, + Token::Whitespace(Whitespace::Space), + Token::SingleQuotedString("aaa".to_string()), + ]; + compare(expected, tokens); + } }