-
Notifications
You must be signed in to change notification settings - Fork 86
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #209 from amaanq/rewrite-it-in-c
feat: rewrite the scanner in C
- Loading branch information
Showing
5 changed files
with
161 additions
and
167 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,152 @@ | ||
#include <assert.h> | ||
#include <string.h> | ||
#include <tree_sitter/parser.h> | ||
#include <wctype.h> | ||
|
||
enum TokenType { RAW_STRING_DELIMITER, RAW_STRING_CONTENT }; | ||
|
||
#define RAW_STRING_DELIMITER_MAX 16 | ||
|
||
typedef struct { | ||
uint8_t delimiter_length; | ||
wchar_t delimiter[RAW_STRING_DELIMITER_MAX]; | ||
} Scanner; | ||
|
||
static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); } | ||
|
||
static inline void skip(TSLexer *lexer) { lexer->advance(lexer, true); } | ||
|
||
static inline void reset(Scanner *scanner) { | ||
scanner->delimiter_length = 0; | ||
memset(scanner->delimiter, 0, sizeof(scanner->delimiter)); | ||
} | ||
|
||
static bool scan_raw_string_delimiter(Scanner *scanner, TSLexer *lexer) { | ||
if (scanner->delimiter_length > 0) { | ||
// Closing delimiter: must exactly match the opening delimiter. | ||
// We already checked this when scanning content, but this is how we | ||
// know when to stop. We can't stop at ", because R"""hello""" is valid. | ||
for (int i = 0; i < scanner->delimiter_length; ++i) { | ||
if (lexer->lookahead != scanner->delimiter[i]) | ||
return false; | ||
advance(lexer); | ||
} | ||
reset(scanner); | ||
return true; | ||
} | ||
|
||
// Opening delimiter: record the d-char-sequence up to (. | ||
// d-char is any basic character except parens, backslashes, and spaces. | ||
for (;;) { | ||
if (scanner->delimiter_length >= RAW_STRING_DELIMITER_MAX || | ||
lexer->eof(lexer) || lexer->lookahead == '\\' || | ||
iswspace(lexer->lookahead)) { | ||
return false; | ||
} | ||
if (lexer->lookahead == '(') { | ||
// Rather than create a token for an empty delimiter, we fail and | ||
// let the grammar fall back to a delimiter-less rule. | ||
return scanner->delimiter_length > 0; | ||
} | ||
scanner->delimiter[scanner->delimiter_length++] = lexer->lookahead; | ||
advance(lexer); | ||
} | ||
} | ||
|
||
// Scan the raw string content in R"delimiter(content)delimiter". | ||
static bool scan_raw_string_content(Scanner *scanner, TSLexer *lexer) { | ||
// The progress made through the delimiter since the last ')'. | ||
// The delimiter may not contain ')' so a single counter suffices. | ||
int delimiter_index = -1; | ||
for (;;) { | ||
// If we hit EOF, consider the content to terminate there. | ||
// This forms an incomplete raw_string_literal, and models the code | ||
// well. | ||
if (lexer->eof(lexer)) { | ||
lexer->mark_end(lexer); | ||
return true; | ||
} | ||
|
||
if (delimiter_index >= 0) { | ||
if (delimiter_index == scanner->delimiter_length) { | ||
if (lexer->lookahead == '"') { | ||
return true; | ||
} else { | ||
delimiter_index = -1; | ||
} | ||
} else { | ||
if (lexer->lookahead == scanner->delimiter[delimiter_index]) { | ||
delimiter_index++; | ||
} else { | ||
delimiter_index = -1; | ||
} | ||
} | ||
} | ||
|
||
if (delimiter_index == -1 && lexer->lookahead == ')') { | ||
// The content doesn't include the )delimiter" part. | ||
// We must still scan through it, but exclude it from the token. | ||
lexer->mark_end(lexer); | ||
delimiter_index = 0; | ||
} | ||
|
||
advance(lexer); | ||
} | ||
} | ||
|
||
static inline bool scan(Scanner *scanner, TSLexer *lexer, | ||
const bool *valid_symbols) { | ||
// No skipping leading whitespace: raw-string grammar is space-sensitive. | ||
|
||
if (valid_symbols[RAW_STRING_DELIMITER]) { | ||
lexer->result_symbol = RAW_STRING_DELIMITER; | ||
return scan_raw_string_delimiter(scanner, lexer); | ||
} | ||
|
||
if (valid_symbols[RAW_STRING_CONTENT]) { | ||
lexer->result_symbol = RAW_STRING_CONTENT; | ||
return scan_raw_string_content(scanner, lexer); | ||
} | ||
|
||
return false; | ||
} | ||
|
||
void *tree_sitter_cpp_external_scanner_create() { | ||
Scanner *scanner = (Scanner *)calloc(1, sizeof(Scanner)); | ||
memset(scanner, 0, sizeof(Scanner)); | ||
return scanner; | ||
} | ||
|
||
bool tree_sitter_cpp_external_scanner_scan(void *payload, TSLexer *lexer, | ||
const bool *valid_symbols) { | ||
Scanner *scanner = (Scanner *)payload; | ||
|
||
return scan(scanner, lexer, valid_symbols); | ||
} | ||
|
||
unsigned tree_sitter_cpp_external_scanner_serialize(void *payload, | ||
char *buffer) { | ||
assert(RAW_STRING_DELIMITER_MAX * sizeof(wchar_t) < | ||
TREE_SITTER_SERIALIZATION_BUFFER_SIZE && | ||
"Serialized delimiter is too long!"); | ||
Scanner *scanner = (Scanner *)payload; | ||
size_t size = scanner->delimiter_length * sizeof(wchar_t); | ||
memcpy(buffer, scanner->delimiter, size); | ||
return (unsigned)size; | ||
} | ||
|
||
void tree_sitter_cpp_external_scanner_deserialize(void *payload, | ||
const char *buffer, | ||
unsigned length) { | ||
assert(length % sizeof(wchar_t) == 0 && | ||
"Can't decode serialized delimiter!"); | ||
|
||
Scanner *scanner = (Scanner *)payload; | ||
scanner->delimiter_length = length / sizeof(wchar_t); | ||
memcpy(&scanner->delimiter[0], buffer, length); | ||
} | ||
|
||
void tree_sitter_cpp_external_scanner_destroy(void *payload) { | ||
Scanner *scanner = (Scanner *)payload; | ||
free(scanner); | ||
} |
This file was deleted.
Oops, something went wrong.