Skip to content

Commit

Permalink
Split out the interpolation tokens in a raw_str
Browse files Browse the repository at this point in the history
The syntax highlighting Xcode applies treats `#"abc` as String, `\#(` as Plain Text,
`value` according to the variable's scope, `)` as Plain Text, and `."#` as String.

In order for us to do the same, we need to separate out the `\#(` from
the prior `raw_str_part` (and the same with the closing parenthesis).
This requires a slightly complicated dance between the grammar and the
custom scanner, where the scanner validates the `\#(` but does not parse
it, and the grammar parses it but does not validate it. In turn, the
closing parenthesis gets consumed as a separate token by the grammar but
must leave some remnant behind in the form of a synthetic external rule
that just signals its position.
  • Loading branch information
alex-pinkus committed Oct 13, 2021
1 parent 90bc6e8 commit c152d39
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 19 deletions.
30 changes: 20 additions & 10 deletions corpus/literals.txt
Original file line number Diff line number Diff line change
Expand Up @@ -229,9 +229,13 @@ extension URL {
(control_transfer_statement
(raw_string_literal
(raw_str_part)
(simple_identifier)
(raw_str_interpolation
(raw_str_interpolation_start)
(simple_identifier))
(raw_str_part)
(simple_identifier)
(raw_str_interpolation
(raw_str_interpolation_start)
(simple_identifier))
(raw_str_end_part)))))))))

==================
Expand All @@ -242,7 +246,7 @@ print(#"Hello \#(world /* commented out)"#) */ )"#)

let _ = ##"Multiple pound signs \##(interpolated): still one part "# not done yet "##
let _ = ##"Fake \#(interpolation) and unused # pound signs "##
let _ = ##"\##(a)\#(b)\##(c)\###(d)\##(e)\##"##
let _ = ##"\##(a)\#(b)\##(c)\#(d)"# ##"##

---

Expand All @@ -254,16 +258,20 @@ let _ = ##"\##(a)\#(b)\##(c)\###(d)\##(e)\##"##
(value_argument
(raw_string_literal
(raw_str_part)
(simple_identifier)
(multiline_comment)
(raw_str_interpolation
(raw_str_interpolation_start)
(simple_identifier)
(multiline_comment))
(raw_str_end_part))))))
(property_declaration
(value_binding_pattern
(non_binding_pattern
(wildcard_pattern)))
(raw_string_literal
(raw_str_part)
(simple_identifier)
(raw_str_interpolation
(raw_str_interpolation_start)
(simple_identifier))
(raw_str_end_part)))
(property_declaration
(value_binding_pattern
Expand All @@ -277,9 +285,11 @@ let _ = ##"\##(a)\#(b)\##(c)\###(d)\##(e)\##"##
(wildcard_pattern)))
(raw_string_literal
(raw_str_part)
(simple_identifier)
(raw_str_part)
(simple_identifier)
(raw_str_interpolation
(raw_str_interpolation_start)
(simple_identifier))
(raw_str_part)
(simple_identifier)
(raw_str_interpolation
(raw_str_interpolation_start)
(simple_identifier))
(raw_str_end_part))))
17 changes: 16 additions & 1 deletion grammar.js
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ module.exports = grammar({
externals: ($) => [
$.multiline_comment,
$.raw_str_part,
$.raw_str_continuing_indicator,
$.raw_str_end_part,
$._semi,
$._arrow_operator,
Expand Down Expand Up @@ -202,7 +203,21 @@ module.exports = grammar({
),

raw_string_literal: ($) =>
seq(repeat(seq($.raw_str_part, $._expression)), $.raw_str_end_part),
seq(
repeat(
seq(
$.raw_str_part,
$.raw_str_interpolation,
optional($.raw_str_continuing_indicator)
)
),
$.raw_str_end_part
),

raw_str_interpolation: ($) =>
seq($.raw_str_interpolation_start, $._expression, ")"),

raw_str_interpolation_start: ($) => /\\#*\(/,

_multi_line_string_content: ($) =>
choice($._multi_line_str_text, $._escaped_identifier, '"'),
Expand Down
34 changes: 26 additions & 8 deletions src/scanner.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
enum TokenType {
BLOCK_COMMENT,
RAW_STR_PART,
RAW_STR_CONTINUING_INDICATOR,
RAW_STR_END_PART,
SEMI,
ARROW_OPERATOR,
Expand Down Expand Up @@ -281,43 +282,61 @@ static bool eat_raw_str_part(
return false;
}

} else if (lexer->lookahead == ')') {
// This is the end of an interpolation - now it's another raw_str_part
advance(lexer);
} else if (valid_symbols[RAW_STR_CONTINUING_INDICATOR]) {
// This is the end of an interpolation - now it's another raw_str_part. This is a synthetic
// marker to tell us that the grammar just consumed a `)` symbol to close a raw
// interpolation (since we don't want to fire on every `)` in existence). We don't have
// anything to do except continue.
} else {
return false;
}

// We're in a state where anything other than `hash_count` hash symbols in a row should be eaten
// and is part of a string.
// The last character _before_ the hashes will tell us what happens next.
// Matters are also complicated by the fact that we don't want to consume every character we
// visit; if we see a `\#(`, for instance, with the appropriate number of hash symbols, we want
// to end our parsing _before_ that sequence. This allows highlighting tools to treat that as a
// separate token.
while (lexer->lookahead != '\0') {
uint8_t last_char = '\0';
lexer->mark_end(lexer); // We always want to parse thru the start of the string so far
// Advance through anything that isn't a hash symbol, because we want to count those.
while (lexer->lookahead != '#') {
last_char = lexer->lookahead;
advance(lexer);
if (last_char != '\\') {
// Mark a new end, but only if we didn't just advance past a `\` symbol, since we
// don't want to consume that.
lexer->mark_end(lexer);
}
}

// We hit at least one hash - count them and see if they match.
uint32_t current_hash_count = 0;
while (lexer->lookahead == '#' && current_hash_count < hash_count) {
current_hash_count += 1;
advance(lexer);
}

// If we saw exactly the right number of hashes, one of three things is true:
// 1. We're trying to interpolate into this string.
// 2. The string just ended.
// 3. This was just some hash characters doing nothing important.
if (current_hash_count == hash_count) {
if (last_char == '\\' && lexer->lookahead == '(') {
// Interpolation is starting. Advance the lexer to include the parenthesis.
advance(lexer);
// Interpolation case! Don't consume those chars; they get saved for grammar.js.
*symbol_result = RAW_STR_PART;
state->ongoing_raw_str_hash_count = hash_count;
return true;
} else if (last_char == '"') {
// The string is finished! Do not advance, since the character after the `#` is not part
// of the result.
// The string is finished! Mark the end here, on the very last hash symbol.
lexer->mark_end(lexer);
*symbol_result = RAW_STR_END_PART;
state->ongoing_raw_str_hash_count = 0;
return true;
}
// Nothing special happened - let the string continue.
}
}

Expand Down Expand Up @@ -364,7 +383,6 @@ bool tree_sitter_swift_external_scanner_scan(
enum TokenType raw_str_result;
bool saw_raw_str_part = eat_raw_str_part(state, lexer, valid_symbols, &raw_str_result);
if (saw_raw_str_part) {
lexer->mark_end(lexer);
lexer->result_symbol = raw_str_result;
return true;
}
Expand Down

0 comments on commit c152d39

Please sign in to comment.