Skip to content

Commit

Permalink
Add support for raw strings
Browse files Browse the repository at this point in the history
Uses a custom scanner and ongoing state to parse code like:
```
extension URL {
    func html(withTitle title: String) -> String {
        return #"<a href="\#(absoluteString)">\#(title)</a>"#
    }
}
```

Fixes #7, #11
  • Loading branch information
alex-pinkus committed Oct 10, 2021
1 parent 2dbc19e commit 9b2892c
Show file tree
Hide file tree
Showing 3 changed files with 245 additions and 10 deletions.
110 changes: 110 additions & 0 deletions corpus/literals.txt
Original file line number Diff line number Diff line change
Expand Up @@ -173,3 +173,113 @@ let _ = nil

(source_file
(property_declaration (value_binding_pattern (non_binding_pattern (wildcard_pattern)))))


==================
Raw strings
==================

let _ = #"Hello, world!"#
let _ = ##"Hello, so-called "world"!"##

---

(source_file
(property_declaration
(value_binding_pattern
(non_binding_pattern
(wildcard_pattern)))
(raw_string_literal
(raw_str_end_part)))
(property_declaration
(value_binding_pattern
(non_binding_pattern
(wildcard_pattern)))
(raw_string_literal
(raw_str_end_part))))

==================
Raw strings with interpolation
==================

extension URL {
func html(withTitle title: String) -> String {
return #"<a href="\#(absoluteString)">\#(title)</a>"#
}
}

---

(source_file
(class_declaration
(identifier
(simple_identifier))
(class_body
(function_declaration
(simple_identifier)
(external_parameter_name)
(parameter
(simple_identifier)
(user_type
(type_identifier)))
(user_type
(type_identifier))
(function_body
(statements
(control_transfer_statement
(raw_string_literal
(raw_str_part)
(simple_identifier)
(raw_str_part)
(simple_identifier)
(raw_str_end_part)))))))))

==================
Raw strings interpolation edge cases
==================

print(#"Hello \#(world /* commented out)"#) */ )"#)

let _ = ##"Multiple pound signs \##(interpolated): still one part "# not done yet "##
let _ = ##"Fake \#(interpolation) and unused # pound signs "##
let _ = ##"\##(a)\#(b)\##(c)\###(d)\##(e)\##"##

---

(source_file
(call_expression
(simple_identifier)
(call_suffix
(value_arguments
(value_argument
(raw_string_literal
(raw_str_part)
(simple_identifier)
(multiline_comment)
(raw_str_end_part))))))
(property_declaration
(value_binding_pattern
(non_binding_pattern
(wildcard_pattern)))
(raw_string_literal
(raw_str_part)
(simple_identifier)
(raw_str_end_part)))
(property_declaration
(value_binding_pattern
(non_binding_pattern
(wildcard_pattern)))
(raw_string_literal
(raw_str_end_part)))
(property_declaration
(value_binding_pattern
(non_binding_pattern
(wildcard_pattern)))
(raw_string_literal
(raw_str_part)
(simple_identifier)
(raw_str_part)
(simple_identifier)
(raw_str_part)
(simple_identifier)
(raw_str_end_part))))
14 changes: 10 additions & 4 deletions grammar.js
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@
* SOFTWARE.
*/

// Using an adapted version of https://kotlinlang.org/docs/reference/grammar.html

const PREC = {
NAVIGATION: 13,
MULTIPLICATIVE: 12,
Expand Down Expand Up @@ -99,6 +97,8 @@ module.exports = grammar({

externals: ($) => [
$.multiline_comment,
$.raw_str_part,
$.raw_str_end_part,
$._semi,
$._arrow_operator,
$._dot_operator,
Expand Down Expand Up @@ -175,7 +175,11 @@ module.exports = grammar({
// String literals

_string_literal: ($) =>
choice($.line_string_literal, $.multi_line_string_literal),
choice(
$.line_string_literal,
$.multi_line_string_literal,
$.raw_string_literal
),

line_string_literal: ($) =>
seq('"', repeat(choice($._line_string_content, $._interpolation)), '"'),
Expand All @@ -190,14 +194,16 @@ module.exports = grammar({

_uni_character_literal: ($) => seq("\\", "u", /[0-9a-fA-F]{4}/), // TODO: { }

// TODO: # delimiter (probably use Rust's custom scanner for this)
multi_line_string_literal: ($) =>
seq(
'"""',
repeat(choice($._multi_line_string_content, $._interpolation)),
'"""'
),

// A raw string literal such as `#"text \#(expr) more"#`. The external scanner
// (src/scanner.c) produces the pieces: one `raw_str_part` for each segment
// that ends at the start of an interpolation, then a `raw_str_end_part` for
// the segment that ends at the closing delimiter. A raw string with no
// interpolation is therefore a single `raw_str_end_part`.
raw_string_literal: ($) =>
seq(repeat(seq($.raw_str_part, $._expression)), $.raw_str_end_part),

_multi_line_string_content: ($) =>
choice($._multi_line_str_text, $._escaped_identifier, '"'),

Expand Down
131 changes: 125 additions & 6 deletions src/scanner.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

enum TokenType {
BLOCK_COMMENT,
RAW_STR_PART,
RAW_STR_END_PART,
SEMI,
ARROW_OPERATOR,
DOT_OPERATOR,
Expand Down Expand Up @@ -57,15 +59,51 @@ const enum TokenType CROSS_SEMI_SYMBOLS[CROSS_SEMI_OPERATOR_COUNT] = {
RETHROWS_KEYWORD
};

struct ScannerState {
uint32_t ongoing_raw_str_hash_count;
};

// Allocate the scanner's persistent state.
//
// Use calloc rather than malloc so `ongoing_raw_str_hash_count` starts at 0:
// tree-sitter may invoke `scan` after a `deserialize` call whose buffer is
// shorter than 4 bytes (a brand-new parse), and in that case the state is not
// overwritten — with malloc it would hold indeterminate garbage and the
// scanner could believe it is mid-way through a raw string.
void *tree_sitter_swift_external_scanner_create() {
  return calloc(1, sizeof(struct ScannerState));
}

// Release the state allocated by `..._scanner_create`.
// `free(NULL)` is defined as a no-op, so a failed allocation is safe here.
void tree_sitter_swift_external_scanner_destroy(void *payload) {
  free(payload);
}
void tree_sitter_swift_external_scanner_destroy(void *p) {}
void tree_sitter_swift_external_scanner_reset(void *p) {}
unsigned tree_sitter_swift_external_scanner_serialize(void *p, char *buffer) {
return 0;

// Forget any raw string that was in progress so a fresh parse begins
// outside of every literal.
void tree_sitter_swift_external_scanner_reset(void *payload) {
  ((struct ScannerState *)payload)->ongoing_raw_str_hash_count = 0;
}

// Persist the delimiter width (pound-sign count) of any raw string currently
// being scanned, encoded big-endian into `buffer`, so that an in-progress
// literal survives incremental re-parsing. Returns the number of bytes used.
unsigned tree_sitter_swift_external_scanner_serialize(void *payload, char *buffer) {
  uint32_t value = ((struct ScannerState *)payload)->ongoing_raw_str_hash_count;
  for (int i = 3; i >= 0; i--) {
    buffer[i] = (char)(value & 0xff);
    value >>= 8;
  }
  return 4;
}

// Restore the scanner state written by `..._serialize`.
//
// Fixes two issues in the original:
//  - `char` may be signed, so `((uint32_t) buffer[i])` sign-extends: a byte
//    such as 0xFF became 0xFFFFFFFF and, once OR'd in, smeared 1-bits across
//    the higher positions of the reassembled count. Casting through `uint8_t`
//    first keeps each byte's contribution confined to its own 8 bits.
//  - A buffer shorter than 4 bytes means "no saved state" (a fresh parse);
//    explicitly clear the count instead of leaving stale state behind.
void tree_sitter_swift_external_scanner_deserialize(
  void *payload,
  const char *buffer,
  unsigned length
) {
  struct ScannerState *state = (struct ScannerState *)payload;

  if (length < 4) {
    state->ongoing_raw_str_hash_count = 0;
    return;
  }

  state->ongoing_raw_str_hash_count =
      (((uint32_t)(uint8_t)buffer[0]) << 24) |
      (((uint32_t)(uint8_t)buffer[1]) << 16) |
      (((uint32_t)(uint8_t)buffer[2]) << 8) |
      ((uint32_t)(uint8_t)buffer[3]);
}
void tree_sitter_swift_external_scanner_deserialize(void *p, const char *b, unsigned n) {}

static void advance(TSLexer *lexer) {
lexer->advance(lexer, false);
Expand Down Expand Up @@ -217,12 +255,83 @@ static bool eat_comment(
return false;
}

// Scan one segment of a raw string literal.
//
// Two entry situations are handled:
//  - No raw string in progress (`state->ongoing_raw_str_hash_count == 0`):
//    match the opening delimiter — a run of `#` followed by `"`.
//  - Resuming after an interpolation: match the interpolation's closing `)`.
//
// The segment then runs until either `\` + matching pound run + `(`
// (emits RAW_STR_PART and records the delimiter width in `state`) or
// `"` + matching pound run (emits RAW_STR_END_PART and clears the state).
// Returns false — without emitting — on newline or end-of-file, since this
// scanner only supports single-line raw strings.
static bool eat_raw_str_part(
    struct ScannerState *state,
    TSLexer *lexer,
    const bool *valid_symbols,
    enum TokenType *symbol_result
) {
  uint32_t hash_count = state->ongoing_raw_str_hash_count;
  if (!valid_symbols[RAW_STR_PART]) {
    return false;
  } else if (hash_count == 0) {
    // If this is a raw_str_part, it's the first one - look for hashes.
    while (lexer->lookahead == '#') {
      hash_count += 1;
      advance(lexer);
    }

    if (hash_count == 0) {
      return false;
    }

    if (lexer->lookahead == '"') {
      advance(lexer);
    } else {
      return false;
    }

  } else if (lexer->lookahead == ')') {
    // This is the end of an interpolation - now it's another raw_str_part.
    advance(lexer);
  } else {
    return false;
  }

  // We're in a state where anything other than `hash_count` hash symbols in a
  // row is part of the string. The last character _before_ the hashes tells
  // us what happens next.
  while (lexer->lookahead != '\n' && lexer->lookahead != '\0') {
    // Track the full 32-bit code point: truncating `lookahead` to a byte
    // could make a multi-byte character's low byte masquerade as `\` or `"`.
    int32_t last_char = '\0';
    while (lexer->lookahead != '#') {
      // Guard against an unterminated literal: the original loop consumed
      // newlines (and spun forever at EOF, where `advance` is a no-op and
      // lookahead stays 0) while hunting for the next `#`.
      if (lexer->lookahead == '\n' || lexer->lookahead == '\0') {
        return false;
      }
      last_char = lexer->lookahead;
      advance(lexer);
    }

    uint32_t current_hash_count = 0;
    while (lexer->lookahead == '#') {
      current_hash_count += 1;
      advance(lexer);
    }

    if (current_hash_count == hash_count) {
      if (last_char == '\\' && lexer->lookahead == '(') {
        // Interpolation is starting. Advance the lexer to include the
        // parenthesis, and remember the delimiter width so the segment after
        // the interpolation's `)` can be matched.
        advance(lexer);
        *symbol_result = RAW_STR_PART;
        state->ongoing_raw_str_hash_count = hash_count;
        return true;
      } else if (last_char == '"') {
        // The string is finished! Do not advance, since the character after
        // the final `#` is not part of the result.
        *symbol_result = RAW_STR_END_PART;
        state->ongoing_raw_str_hash_count = 0;
        return true;
      }
    }
  }

  return false;
}

bool tree_sitter_swift_external_scanner_scan(
void *payload,
TSLexer *lexer,
const bool *valid_symbols
) {
// Figure out our scanner state
struct ScannerState *state = (struct ScannerState *)payload;

// Consume any whitespace at the start.
enum TokenType semi_result;
bool saw_semi = eat_whitespace(lexer, valid_symbols, &semi_result);
Expand Down Expand Up @@ -250,6 +359,16 @@ bool tree_sitter_swift_external_scanner_scan(
return true;
}

// NOTE: this will consume any `#` characters it sees, even if it does not find a result. Keep
// it at the end so that it doesn't interfere with special literals or selectors!
enum TokenType raw_str_result;
bool saw_raw_str_part = eat_raw_str_part(state, lexer, valid_symbols, &raw_str_result);
if (saw_raw_str_part) {
lexer->mark_end(lexer);
lexer->result_symbol = raw_str_result;
return true;
}

return false;
}

0 comments on commit 9b2892c

Please sign in to comment.