diff --git a/crates/apollo-parser/CHANGELOG.md b/crates/apollo-parser/CHANGELOG.md index 54b52e0de..5e74ff57d 100644 --- a/crates/apollo-parser/CHANGELOG.md +++ b/crates/apollo-parser/CHANGELOG.md @@ -19,7 +19,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm # [x.x.x] (unreleased) - 2023-mm-dd ## Fixes -- **apply recursion limit where needed, reduce its default from 4096 to 500 [SimonSapin], [pull/662]** +- **apply recursion limit where needed, reduce its default from 4096 to 500 - [SimonSapin], [pull/662]** The limit was only tracked for nested selection sets, but the parser turns out to use recursion in other cases too. [Issue 666] tracks reducing them. Stack overflow was observed with little more than 2000 @@ -28,9 +28,12 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm - **fix various lexer bugs - [SimonSapin], [pull/646], [pull/652]** The lexer was too permissive in emitting tokens instead of errors in various cases around numbers, strings, and EOF. +- **fix panic on surrogate code points in unicode escape sequences - [SimonSapin], [issue/608], [pull/658]** +[issue/608]: https://github.com/apollographql/apollo-rs/issues/608 [pull/646]: https://github.com/apollographql/apollo-rs/pull/646 [pull/652]: https://github.com/apollographql/apollo-rs/pull/652 +[pull/658]: https://github.com/apollographql/apollo-rs/pull/658 [pull/662]: https://github.com/apollographql/apollo-rs/pull/662 [Issue 666]: https://github.com/apollographql/apollo-rs/issues/666 diff --git a/crates/apollo-parser/src/ast/node_ext.rs b/crates/apollo-parser/src/ast/node_ext.rs index e6fe49d33..f9541f1fb 100644 --- a/crates/apollo-parser/src/ast/node_ext.rs +++ b/crates/apollo-parser/src/ast/node_ext.rs @@ -167,6 +167,8 @@ fn unescape_string(input: &str) -> String { break; }; + // TODO: https://github.com/apollographql/apollo-rs/issues/657 needs + // changes both here and in `lexer/mod.rs` let mut unicode = || { // 1. 
Let value be the 16-bit hexadecimal value represented // by the sequence of hexadecimal digits within EscapedUnicode. diff --git a/crates/apollo-parser/src/lexer/cursor.rs b/crates/apollo-parser/src/lexer/cursor.rs index 591e573d7..71cb77e26 100644 --- a/crates/apollo-parser/src/lexer/cursor.rs +++ b/crates/apollo-parser/src/lexer/cursor.rs @@ -6,8 +6,8 @@ use crate::Error; #[derive(Debug, Clone)] pub(crate) struct Cursor<'a> { index: usize, - offset: usize, - source: &'a str, + pub(super) offset: usize, + pub(super) source: &'a str, chars: CharIndices<'a>, pending: Option, pub(crate) err: Option, diff --git a/crates/apollo-parser/src/lexer/mod.rs b/crates/apollo-parser/src/lexer/mod.rs index 2b3c4351a..56f47fc59 100644 --- a/crates/apollo-parser/src/lexer/mod.rs +++ b/crates/apollo-parser/src/lexer/mod.rs @@ -326,7 +326,24 @@ impl<'a> Cursor<'a> { _ => { if remaining <= 1 { state = State::StringLiteral; - + let hex_end = self.offset + 1; + let hex_start = hex_end - 4; + let hex = &self.source[hex_start..hex_end]; + // `is_ascii_hexdigit()` checks in previous iterations ensure + // this `unwrap()` does not panic: + let code_point = u32::from_str_radix(hex, 16).unwrap(); + if char::from_u32(code_point).is_none() { + // TODO: https://github.com/apollographql/apollo-rs/issues/657 needs + // changes both here and in `ast/node_ext.rs` + let escape_sequence_start = hex_start - 2; // include "\u" + let escape_sequence = &self.source[escape_sequence_start..hex_end]; + self.add_err(Error::new( + "surrogate code point is invalid in unicode escape sequence \ + (paired surrogate not supported yet: \ + https://github.com/apollographql/apollo-rs/issues/657)", + escape_sequence.to_owned(), + )); + } continue; } diff --git a/crates/apollo-parser/test_data/lexer/err/0030_escaped_surrogate.graphql b/crates/apollo-parser/test_data/lexer/err/0030_escaped_surrogate.graphql new file mode 100644 index 000000000..ec4d11971 --- /dev/null +++ 
b/crates/apollo-parser/test_data/lexer/err/0030_escaped_surrogate.graphql @@ -0,0 +1,12 @@ +# TODO: move these cases back to ok/0004_string_value.graphql when +# https://github.com/apollographql/apollo-rs/issues/657 is implemented +"string with unicode surrogate pair escape \uD83D\uDE00" +"string with minimal surrogate pair escape \uD800\uDC00" +"string with maximal surrogate pair escape \uDBFF\uDFFF" + +# TODO: emit two errors: https://github.com/apollographql/apollo-rs/issues/319 +"split pair \uD83D \uDE00" + +"Backwards pair \uDE00\uD83D" +"Lone lead surrogate \uD83E" +"Lone trail surrogate \uDD80" diff --git a/crates/apollo-parser/test_data/lexer/err/0030_escaped_surrogate.txt b/crates/apollo-parser/test_data/lexer/err/0030_escaped_surrogate.txt new file mode 100644 index 000000000..f865bf767 --- /dev/null +++ b/crates/apollo-parser/test_data/lexer/err/0030_escaped_surrogate.txt @@ -0,0 +1,21 @@ +COMMENT@0:66 "# TODO: move these cases back to ok/0004_string_value.graphql when" +WHITESPACE@66:67 "\n" +COMMENT@67:137 "# https://github.com/apollographql/apollo-rs/issues/657 is implemented" +WHITESPACE@137:138 "\n" +ERROR@138:194 "surrogate code point is invalid in unicode escape sequence (paired surrogate not supported yet: https://github.com/apollographql/apollo-rs/issues/657)" "string with unicode surrogate pair escape \uD83D\uDE00" +WHITESPACE@194:195 "\n" +ERROR@195:251 "surrogate code point is invalid in unicode escape sequence (paired surrogate not supported yet: https://github.com/apollographql/apollo-rs/issues/657)" "string with minimal surrogate pair escape \uD800\uDC00" +WHITESPACE@251:252 "\n" +ERROR@252:308 "surrogate code point is invalid in unicode escape sequence (paired surrogate not supported yet: https://github.com/apollographql/apollo-rs/issues/657)" "string with maximal surrogate pair escape \uDBFF\uDFFF" +WHITESPACE@308:310 "\n\n" +COMMENT@310:388 "# TODO: emit two errors: https://github.com/apollographql/apollo-rs/issues/319" 
+WHITESPACE@388:389 "\n" +ERROR@389:415 "surrogate code point is invalid in unicode escape sequence (paired surrogate not supported yet: https://github.com/apollographql/apollo-rs/issues/657)" "split pair \uD83D \uDE00" +WHITESPACE@415:417 "\n\n" +ERROR@417:446 "surrogate code point is invalid in unicode escape sequence (paired surrogate not supported yet: https://github.com/apollographql/apollo-rs/issues/657)" "Backwards pair \uDE00\uD83D" +WHITESPACE@446:447 "\n" +ERROR@447:475 "surrogate code point is invalid in unicode escape sequence (paired surrogate not supported yet: https://github.com/apollographql/apollo-rs/issues/657)" "Lone lead surrogate \uD83E" +WHITESPACE@475:476 "\n" +ERROR@476:505 "surrogate code point is invalid in unicode escape sequence (paired surrogate not supported yet: https://github.com/apollographql/apollo-rs/issues/657)" "Lone trail surrogate \uDD80" +WHITESPACE@505:506 "\n" +EOF@506:506 diff --git a/crates/apollo-parser/test_data/lexer/ok/0004_string_value.graphql b/crates/apollo-parser/test_data/lexer/ok/0004_string_value.graphql index 0b036c7e9..fe191df7e 100644 --- a/crates/apollo-parser/test_data/lexer/ok/0004_string_value.graphql +++ b/crates/apollo-parser/test_data/lexer/ok/0004_string_value.graphql @@ -2,9 +2,6 @@ "simple" " white space " "unicode \u1234\u5678\u90AB\uCDEF" -"string with unicode surrogate pair escape \uD83D\uDE00" -"string with minimal surrogate pair escape \uD800\uDC00" -"string with maximal surrogate pair escape \uDBFF\uDFFF" "string with \"escaped\" characters" "string with multiple languages котя, 猫, ねこ, قطة" """ diff --git a/crates/apollo-parser/test_data/lexer/ok/0004_string_value.txt b/crates/apollo-parser/test_data/lexer/ok/0004_string_value.txt index 7611cc2f1..6fe6a9bbb 100644 --- a/crates/apollo-parser/test_data/lexer/ok/0004_string_value.txt +++ b/crates/apollo-parser/test_data/lexer/ok/0004_string_value.txt @@ -6,16 +6,10 @@ STRING_VALUE@12:27 "\" white space \"" WHITESPACE@27:28 "\n" 
STRING_VALUE@28:62 "\"unicode \\u1234\\u5678\\u90AB\\uCDEF\"" WHITESPACE@62:63 "\n" -STRING_VALUE@63:119 "\"string with unicode surrogate pair escape \\uD83D\\uDE00\"" -WHITESPACE@119:120 "\n" -STRING_VALUE@120:176 "\"string with minimal surrogate pair escape \\uD800\\uDC00\"" -WHITESPACE@176:177 "\n" -STRING_VALUE@177:233 "\"string with maximal surrogate pair escape \\uDBFF\\uDFFF\"" -WHITESPACE@233:234 "\n" -STRING_VALUE@234:270 "\"string with \\\"escaped\\\" characters\"" -WHITESPACE@270:271 "\n" -STRING_VALUE@271:333 "\"string with multiple languages котя, 猫, ねこ, قطة\"" -WHITESPACE@333:334 "\n" -STRING_VALUE@334:421 "\"\"\"\nblock string with unusual whitespaces\na b c\nd\n\ne\tf\ng\u{2028}\u{2029}h\ni\u{b}j\u{c}k\u{feff}l\u{85}\u{200e}\u{200f}m\n\"\"\"" -WHITESPACE@421:422 "\n" -EOF@422:422 +STRING_VALUE@63:99 "\"string with \\\"escaped\\\" characters\"" +WHITESPACE@99:100 "\n" +STRING_VALUE@100:162 "\"string with multiple languages котя, 猫, ねこ, قطة\"" +WHITESPACE@162:163 "\n" +STRING_VALUE@163:250 "\"\"\"\nblock string with unusual whitespaces\na b c\nd\n\ne\tf\ng\u{2028}\u{2029}h\ni\u{b}j\u{c}k\u{feff}l\u{85}\u{200e}\u{200f}m\n\"\"\"" +WHITESPACE@250:251 "\n" +EOF@251:251