Skip to content

Commit

Permalink
feat: Strings with Unicode Escapes (#43)
Browse files Browse the repository at this point in the history
Can drop this after rebase on commit bc15f7b "Support for postgres String Constants with Unicode Escapes (apache#1355)", first released in 0.50.0
  • Loading branch information
waralexrom authored and mcheshkov committed Sep 2, 2024
1 parent 96d13ea commit 5420201
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 1 deletion.
3 changes: 3 additions & 0 deletions src/ast/value.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ pub enum Value {
NationalStringLiteral(String),
/// X'hex value'
HexStringLiteral(String),
/// U&'string value' - string constant with Unicode escapes, e.g. U&'d\0061t\+000061'
UnicodeEscapedStringLiteral(String),

DoubleQuotedString(String),
/// Boolean value true or false
Expand Down Expand Up @@ -75,6 +77,7 @@ impl fmt::Display for Value {
Value::DoubleQuotedString(v) => write!(f, "\"{}\"", v),
Value::SingleQuotedString(v) => write!(f, "'{}'", escape_single_quote_string(v)),
Value::EscapedStringLiteral(v) => write!(f, "E'{}'", escape_escaped_string(v)),
Value::UnicodeEscapedStringLiteral(v) => write!(f, "U&'{}'", escape_escaped_string(v)),
Value::NationalStringLiteral(v) => write!(f, "N'{}'", v),
Value::HexStringLiteral(v) => write!(f, "X'{}'", v),
Value::Boolean(v) => write!(f, "{}", v),
Expand Down
6 changes: 5 additions & 1 deletion src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -527,7 +527,7 @@ impl<'a> Parser<'a> {
expr: Box::new(self.parse_subexpr(Self::PLUS_MINUS_PREC)?),
})
}
Token::EscapedStringLiteral(_) if dialect_of!(self is PostgreSqlDialect | GenericDialect) =>
Token::EscapedStringLiteral(_) | Token::UnicodeEscapedStringLiteral(_) if dialect_of!(self is PostgreSqlDialect | GenericDialect) =>
{
self.prev_token();
Ok(Expr::Value(self.parse_value()?))
Expand Down Expand Up @@ -956,6 +956,7 @@ impl<'a> Parser<'a> {
Token::SingleQuotedString(_)
| Token::EscapedStringLiteral(_)
| Token::NationalStringLiteral(_)
| Token::UnicodeEscapedStringLiteral(_)
| Token::HexStringLiteral(_) => Some(Box::new(self.parse_expr()?)),
unexpected => {
self.expected("either filler, WITH, or WITHOUT in LISTAGG", unexpected)?
Expand Down Expand Up @@ -2888,6 +2889,9 @@ impl<'a> Parser<'a> {
Token::SingleQuotedString(ref s) => Ok(Value::SingleQuotedString(s.to_string())),
Token::NationalStringLiteral(ref s) => Ok(Value::NationalStringLiteral(s.to_string())),
Token::EscapedStringLiteral(ref s) => Ok(Value::EscapedStringLiteral(s.to_string())),
Token::UnicodeEscapedStringLiteral(ref s) => {
Ok(Value::UnicodeEscapedStringLiteral(s.to_string()))
}
Token::HexStringLiteral(ref s) => Ok(Value::HexStringLiteral(s.to_string())),
Token::Placeholder(ref s) => Ok(Value::Placeholder(s.to_string())),
unexpected => self.expected("a value", unexpected),
Expand Down
57 changes: 57 additions & 0 deletions src/tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ pub enum Token {
EscapedStringLiteral(String),
/// Hexadecimal string literal: i.e.: X'deadbeef'
HexStringLiteral(String),
/// Unicode escaped string: U&'d\0061t\+000061' (data)
UnicodeEscapedStringLiteral(String),
/// Comma
Comma,
/// Whitespace (space, tab, etc)
Expand Down Expand Up @@ -164,6 +166,7 @@ impl fmt::Display for Token {
Token::NationalStringLiteral(ref s) => write!(f, "N'{}'", s),
Token::EscapedStringLiteral(ref s) => write!(f, "E'{}'", s),
Token::HexStringLiteral(ref s) => write!(f, "X'{}'", s),
Token::UnicodeEscapedStringLiteral(ref s) => write!(f, "U&'{}'", s),
Token::Comma => f.write_str(","),
Token::Whitespace(ws) => write!(f, "{}", ws),
Token::DoubleEq => f.write_str("=="),
Expand Down Expand Up @@ -427,6 +430,28 @@ impl<'a> Tokenizer<'a> {
}
}
}
x @ 'u' | x @ 'U' => {
    chars.next(); // consume the `u`/`U` so the characters after it can be inspected
    // Probe on a clone so that nothing more is consumed from `chars`
    // unless the complete `U&'` prefix is actually present.
    let mut probe = chars.clone();
    let starts_unicode_literal =
        probe.next_if_eq(&'&').is_some() && matches!(probe.peek(), Some('\''));
    if starts_unicode_literal {
        // Consume the `&`; `chars` is now positioned on the opening quote.
        chars.next();
        // U&'...' - a string constant with Unicode escapes
        let s = self.tokenize_single_quoted_string(chars)?;
        Ok(Some(Token::UnicodeEscapedStringLiteral(s)))
    } else {
        // Not a `U&'` prefix: treat the `u`/`U` as the start of an ordinary
        // word. Any `&` that follows is left in `chars` for later tokens.
        let s = self.tokenize_word(x, chars);
        Ok(Some(Token::make_word(&s, None)))
    }
}
// identifier or keyword
ch if self.dialect.is_identifier_start(ch) => {
chars.next(); // consume the first char
Expand Down Expand Up @@ -1454,4 +1479,36 @@ mod tests {
//println!("------------------------------");
assert_eq!(expected, actual);
}
#[test]
fn tokenize_unicode_escaped_literal() {
    // Each case pairs an input with the exact token stream it must produce
    // under the generic dialect.
    let cases = vec![
        // The full `U&'...'` prefix yields a single Unicode-escaped literal token.
        (
            r#"U&'aaa'"#,
            vec![Token::UnicodeEscapedStringLiteral("aaa".to_string())],
        ),
        // `U&` not followed by a quote stays a word, an ampersand and a word.
        (
            r#"U&a"#,
            vec![
                Token::make_word("U", None),
                Token::Ampersand,
                Token::make_word("a", None),
            ],
        ),
        // Whitespace between `U`, `&` and the quote prevents the literal form.
        (
            r#"U & 'aaa'"#,
            vec![
                Token::make_word("U", None),
                Token::Whitespace(Whitespace::Space),
                Token::Ampersand,
                Token::Whitespace(Whitespace::Space),
                Token::SingleQuotedString("aaa".to_string()),
            ],
        ),
    ];

    for (sql, expected) in cases {
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, sql);
        let tokens = tokenizer.tokenize().unwrap();
        compare(expected, tokens);
    }
}
}

0 comments on commit 5420201

Please sign in to comment.