Split out the interpolation tokens in a raw_str

The syntax highlighting Xcode does is #"abc as String, \#( as Plain Text, value depending on the variables scope, ) as Plain Text and ."# as String. In order for us to do the same, we need to separate out the `\#(` from the prior `raw_str_part` (and the same with the closing parenthesis). This requires a slightly complicated dance between the grammar and the custom scanner, where the scanner validates the `\#(` but does not parse it, and the grammar parses it but does not validate it. In turn, the closing parenthesis gets consumed as a separate token by the grammar but must leave some remnant behind in the form of a synthetic external rule that just signals its position.
alex-pinkus · Oct 13, 2021 · c152d39 · c152d39
1 parent 90bc6e8
commit c152d39
Show file tree

Hide file tree

Showing 3 changed files with 62 additions and 19 deletions.
diff --git a/corpus/literals.txt b/corpus/literals.txt
@@ -229,9 +229,13 @@ extension URL {
             (control_transfer_statement
               (raw_string_literal
                 (raw_str_part)
-                (simple_identifier)
+                (raw_str_interpolation
+                    (raw_str_interpolation_start)
+                    (simple_identifier))
                 (raw_str_part)
-                (simple_identifier)
+                (raw_str_interpolation
+                    (raw_str_interpolation_start)
+                    (simple_identifier))
                 (raw_str_end_part)))))))))
 
 ==================
@@ -242,7 +246,7 @@ print(#"Hello \#(world /* commented out)"#)  */ )"#)
 
 let _ = ##"Multiple pound signs \##(interpolated): still one part "# not done yet "##
 let _ = ##"Fake \#(interpolation) and unused # pound signs "##
-let _ = ##"\##(a)\#(b)\##(c)\###(d)\##(e)\##"##
+let _ = ##"\##(a)\#(b)\##(c)\#(d)"# ##"##
 
 ---
 
@@ -254,16 +258,20 @@ let _ = ##"\##(a)\#(b)\##(c)\###(d)\##(e)\##"##
         (value_argument
           (raw_string_literal
             (raw_str_part)
-            (simple_identifier)
-            (multiline_comment)
+            (raw_str_interpolation
+                (raw_str_interpolation_start)
+                (simple_identifier)
+                (multiline_comment))
             (raw_str_end_part))))))
   (property_declaration
     (value_binding_pattern
       (non_binding_pattern
         (wildcard_pattern)))
     (raw_string_literal
       (raw_str_part)
-      (simple_identifier)
+      (raw_str_interpolation
+          (raw_str_interpolation_start)
+          (simple_identifier))
       (raw_str_end_part)))
   (property_declaration
     (value_binding_pattern
@@ -277,9 +285,11 @@ let _ = ##"\##(a)\#(b)\##(c)\###(d)\##(e)\##"##
         (wildcard_pattern)))
     (raw_string_literal
       (raw_str_part)
-      (simple_identifier)
-      (raw_str_part)
-      (simple_identifier)
+      (raw_str_interpolation
+          (raw_str_interpolation_start)
+          (simple_identifier))
       (raw_str_part)
-      (simple_identifier)
+      (raw_str_interpolation
+          (raw_str_interpolation_start)
+          (simple_identifier))
       (raw_str_end_part))))
diff --git a/grammar.js b/grammar.js
@@ -98,6 +98,7 @@ module.exports = grammar({
   externals: ($) => [
     $.multiline_comment,
     $.raw_str_part,
+    $.raw_str_continuing_indicator,
     $.raw_str_end_part,
     $._semi,
     $._arrow_operator,
@@ -202,7 +203,21 @@ module.exports = grammar({
       ),
 
     raw_string_literal: ($) =>
-      seq(repeat(seq($.raw_str_part, $._expression)), $.raw_str_end_part),
+      seq(
+        repeat(
+          seq(
+            $.raw_str_part,
+            $.raw_str_interpolation,
+            optional($.raw_str_continuing_indicator)
+          )
+        ),
+        $.raw_str_end_part
+      ),
+
+    raw_str_interpolation: ($) =>
+      seq($.raw_str_interpolation_start, $._expression, ")"),
+
+    raw_str_interpolation_start: ($) => /\\#*\(/,
 
     _multi_line_string_content: ($) =>
       choice($._multi_line_str_text, $._escaped_identifier, '"'),

diff --git a/src/scanner.c b/src/scanner.c
@@ -4,6 +4,7 @@
 enum TokenType {
     BLOCK_COMMENT,
     RAW_STR_PART,
+    RAW_STR_CONTINUING_INDICATOR,
     RAW_STR_END_PART,
     SEMI,
     ARROW_OPERATOR,
@@ -281,43 +282,61 @@ static bool eat_raw_str_part(
             return false;
         }
 
-    } else if (lexer->lookahead == ')') {
-        // This is the end of an interpolation - now it's another raw_str_part
-        advance(lexer);
+    } else if (valid_symbols[RAW_STR_CONTINUING_INDICATOR]) {
+        // This is the end of an interpolation - now it's another raw_str_part. This is a synthetic
+        // marker to tell us that the grammar just consumed a `(` symbol to close a raw
+        // interpolation (since we don't want to fire on every `(` in existence). We don't have
+        // anything to do except continue.
     } else {
         return false;
     }
 
     // We're in a state where anything other than `hash_count` hash symbols in a row should be eaten
     // and is part of a string.
     // The last character _before_ the hashes will tell us what happens next.
+    // Matters are also complicated by the fact that we don't want to consume every character we
+    // visit; if we see a `\#(`, for instance, with the appropriate number of hash symbols, we want
+    // to end our parsing _before_ that sequence. This allows highlighting tools to treat that as a
+    // separate token.
     while (lexer->lookahead != '\0') {
         uint8_t last_char = '\0';
+        lexer->mark_end(lexer); // We always want to parse thru the start of the string so far
+        // Advance through anything that isn't a hash symbol, because we want to count those.
         while (lexer->lookahead != '#') {
             last_char = lexer->lookahead;
             advance(lexer);
+            if (last_char != '\\') {
+                // Mark a new end, but only if we didn't just advance past a `\` symbol, since we
+                // don't want to consume that.
+                lexer->mark_end(lexer);
+            }
         }
 
+        // We hit at least one hash - count them and see if they match.
         uint32_t current_hash_count = 0;
         while (lexer->lookahead == '#' && current_hash_count < hash_count) {
             current_hash_count += 1;
             advance(lexer);
         }
 
+        // If we saw exactly the right number of hashes, one of three things is true:
+        // 1. We're trying to interpolate into this string.
+        // 2. The string just ended.
+        // 3. This was just some hash characters doing nothing important.
         if (current_hash_count == hash_count) {
             if (last_char == '\\' && lexer->lookahead == '(') {
-                // Interpolation is starting. Advance the lexer to include the parenthesis.
-                advance(lexer);
+                // Interpolation case! Don't consume those chars; they get saved for grammar.js.
                 *symbol_result = RAW_STR_PART;
                 state->ongoing_raw_str_hash_count = hash_count;
                 return true;
             } else if (last_char == '"') {
-                // The string is finished! Do not advance, since the character after the `#` is not part
-                // of the result.
+                // The string is finished! Mark the end here, on the very last hash symbol.
+                lexer->mark_end(lexer);
                 *symbol_result = RAW_STR_END_PART;
                 state->ongoing_raw_str_hash_count = 0;
                 return true;
             }
+            // Nothing special happened - let the string continue.
         }
     }
 
@@ -364,7 +383,6 @@ bool tree_sitter_swift_external_scanner_scan(
     enum TokenType raw_str_result;
     bool saw_raw_str_part = eat_raw_str_part(state, lexer, valid_symbols, &raw_str_result);
     if (saw_raw_str_part) {
-        lexer->mark_end(lexer);
         lexer->result_symbol = raw_str_result;
         return true;
     }