Skip to content

Commit

Permalink
Merge pull request #209 from amaanq/rewrite-it-in-c
Browse files Browse the repository at this point in the history
feat: rewrite the scanner in C
  • Loading branch information
aryx authored Jun 19, 2023
2 parents d63acf7 + d8352a8 commit 2e29f23
Show file tree
Hide file tree
Showing 5 changed files with 161 additions and 167 deletions.
4 changes: 2 additions & 2 deletions Package.swift
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,12 @@ let package = Package(
],
sources: [
"src/parser.c",
"src/scanner.cc",
"src/scanner.c",
],
resources: [
.copy("queries")
],
publicHeadersPath: "bindings/swift",
cSettings: [.headerSearchPath("src")])
]
)
)
2 changes: 1 addition & 1 deletion binding.gyp
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
"sources": [
"src/parser.c",
"bindings/node/binding.cc",
"src/scanner.cc"
"src/scanner.c"
],
"cflags_c": [
"-std=c99",
Expand Down
18 changes: 6 additions & 12 deletions bindings/rust/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,18 @@ fn main() {
let src_dir = std::path::Path::new("src");

let mut c_config = cc::Build::new();
c_config.include(&src_dir);
c_config.include(src_dir);
c_config
.flag_if_supported("-Wno-unused-parameter")
.flag_if_supported("-Wno-unused-but-set-variable")
.flag_if_supported("-Wno-trigraphs");
let parser_path = src_dir.join("parser.c");
c_config.file(&parser_path);
println!("cargo:rerun-if-changed={}", parser_path.to_str().unwrap());
c_config.compile("parser");

let mut cpp_config = cc::Build::new();
cpp_config.cpp(true);
cpp_config.include(&src_dir);
cpp_config
.flag_if_supported("-Wno-unused-parameter")
.flag_if_supported("-Wno-unused-but-set-variable");
let scanner_path = src_dir.join("scanner.cc");
cpp_config.file(&scanner_path);
let scanner_path = src_dir.join("scanner.c");
c_config.file(&scanner_path);
println!("cargo:rerun-if-changed={}", scanner_path.to_str().unwrap());
cpp_config.compile("scanner");

c_config.compile("parser");
println!("cargo:rerun-if-changed={}", parser_path.to_str().unwrap());
}
152 changes: 152 additions & 0 deletions src/scanner.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
#include <assert.h>
#include <string.h>
#include <tree_sitter/parser.h>
#include <wctype.h>

// Token kinds this scanner can produce. The enumerator values are used as
// indices into `valid_symbols` and are stored in `lexer->result_symbol`.
// NOTE(review): presumably the order must match the grammar's list of
// external tokens -- confirm against the grammar definition.
enum TokenType { RAW_STRING_DELIMITER, RAW_STRING_CONTENT };

// Capacity, in characters, of the delimiter buffer stored in Scanner.
#define RAW_STRING_DELIMITER_MAX 16

// Scanner state kept between scan calls and round-tripped through
// serialize/deserialize.
typedef struct {
// Number of characters currently recorded in `delimiter`. Non-zero once an
// opening delimiter has been scanned; set back to 0 when the matching
// closing delimiter is consumed.
uint8_t delimiter_length;
// The recorded opening delimiter; only the first `delimiter_length`
// entries are meaningful.
wchar_t delimiter[RAW_STRING_DELIMITER_MAX];
} Scanner;

/* Move the lexer forward by one character with the skip flag cleared. */
static inline void advance(TSLexer *lexer) {
    const bool skip_char = false;
    lexer->advance(lexer, skip_char);
}

/*
 * Move the lexer forward by one character with the skip flag set.
 * Not called anywhere in the visible part of this file.
 */
static inline void skip(TSLexer *lexer) {
    const bool skip_char = true;
    lexer->advance(lexer, skip_char);
}

/* Put the scanner back into its "no delimiter recorded" state. */
static inline void reset(Scanner *scanner) {
    memset(scanner->delimiter, 0, sizeof scanner->delimiter);
    scanner->delimiter_length = 0;
}

/**
 * Scan a raw string delimiter: the `delimiter` in
 * R"delimiter(content)delimiter".
 *
 * When `scanner` already holds a delimiter, this matches the closing
 * delimiter character-for-character and clears the recorded state on
 * success. Otherwise it records the opening delimiter, stopping in front of
 * the '(' that ends it.
 *
 * @param scanner Scanner state; the recorded delimiter is read and updated.
 * @param lexer   Lexer supplying the lookahead character.
 * @return true when a delimiter was recognized. false for an empty opening
 *         delimiter, one longer than RAW_STRING_DELIMITER_MAX characters,
 *         one containing a backslash or whitespace, end of input before
 *         '(', or a closing delimiter that does not match.
 */
static bool scan_raw_string_delimiter(Scanner *scanner, TSLexer *lexer) {
    if (scanner->delimiter_length > 0) {
        // Closing delimiter: must exactly match the opening delimiter.
        // We already checked this when scanning content, but this is how we
        // know when to stop. We can't stop at ", because R"""hello""" is valid.
        for (int i = 0; i < scanner->delimiter_length; ++i) {
            if (lexer->lookahead != scanner->delimiter[i]) {
                return false;
            }
            advance(lexer);
        }
        reset(scanner);
        return true;
    }

    // Opening delimiter: record the d-char-sequence up to (.
    // d-char is any basic character except parens, backslashes, and spaces.
    for (;;) {
        if (lexer->eof(lexer) || lexer->lookahead == '\\' ||
            iswspace(lexer->lookahead)) {
            return false;
        }
        if (lexer->lookahead == '(') {
            // Rather than create a token for an empty delimiter, we fail and
            // let the grammar fall back to a delimiter-less rule.
            return scanner->delimiter_length > 0;
        }
        // Enforce the capacity only when another character would have to be
        // stored. Checking it before the '(' test would reject a delimiter
        // that fills the buffer exactly (RAW_STRING_DELIMITER_MAX characters).
        if (scanner->delimiter_length >= RAW_STRING_DELIMITER_MAX) {
            return false;
        }
        scanner->delimiter[scanner->delimiter_length++] = lexer->lookahead;
        advance(lexer);
    }
}

// Scan the raw string content in R"delimiter(content)delimiter".
//
// Advances until either end of input or a complete `)delimiter"` sequence
// (using the delimiter recorded in `scanner`) has been seen. Every exit path
// returns true. The token end is fixed with mark_end(), so the characters
// advanced over while probing for the closer are not part of the token.
static bool scan_raw_string_content(Scanner *scanner, TSLexer *lexer) {
// The progress made through the delimiter since the last ')'.
// The delimiter may not contain ')' so a single counter suffices.
// -1 means no candidate closer is in progress; 0..delimiter_length is the
// number of delimiter characters matched after a ')'.
int delimiter_index = -1;
for (;;) {
// If we hit EOF, consider the content to terminate there.
// This forms an incomplete raw_string_literal, and models the code
// well.
// mark_end() here moves the token end to EOF, so a partially matched
// closer ends up inside the content.
if (lexer->eof(lexer)) {
lexer->mark_end(lexer);
return true;
}

if (delimiter_index >= 0) {
if (delimiter_index == scanner->delimiter_length) {
// The whole delimiter matched after ')': only a '"' completes the
// closer. The '"' itself is not advanced over.
if (lexer->lookahead == '"') {
return true;
} else {
delimiter_index = -1;
}
} else {
// Still inside the candidate closer: compare the next delimiter
// character.
if (lexer->lookahead == scanner->delimiter[delimiter_index]) {
delimiter_index++;
} else {
delimiter_index = -1;
}
}
}

// This check deliberately runs after the block above: when a candidate
// has just been abandoned on a ')', that same ')' starts a new candidate.
if (delimiter_index == -1 && lexer->lookahead == ')') {
// The content doesn't include the )delimiter" part.
// We must still scan through it, but exclude it from the token.
lexer->mark_end(lexer);
delimiter_index = 0;
}

advance(lexer);
}
}

/*
 * Dispatch to the scanner for whichever external token is currently valid.
 * The delimiter takes priority: when it is valid, content is never attempted,
 * even if the delimiter scan fails. `lexer->result_symbol` is assigned before
 * the corresponding scan routine runs.
 *
 * Leading whitespace is never skipped: raw-string grammar is space-sensitive.
 */
static inline bool scan(Scanner *scanner, TSLexer *lexer,
                        const bool *valid_symbols) {
    if (!valid_symbols[RAW_STRING_DELIMITER]) {
        if (!valid_symbols[RAW_STRING_CONTENT]) {
            return false;
        }
        lexer->result_symbol = RAW_STRING_CONTENT;
        return scan_raw_string_content(scanner, lexer);
    }

    lexer->result_symbol = RAW_STRING_DELIMITER;
    return scan_raw_string_delimiter(scanner, lexer);
}

void *tree_sitter_cpp_external_scanner_create() {
Scanner *scanner = (Scanner *)calloc(1, sizeof(Scanner));
memset(scanner, 0, sizeof(Scanner));
return scanner;
}

/*
 * Scan entry point: reinterpret `payload` as the Scanner state and forward
 * everything to scan().
 */
bool tree_sitter_cpp_external_scanner_scan(void *payload, TSLexer *lexer,
                                           const bool *valid_symbols) {
    return scan((Scanner *)payload, lexer, valid_symbols);
}

/*
 * Copy the recorded delimiter characters into `buffer` as raw wchar_t bytes.
 * Returns the number of bytes written, which is 0 when no delimiter is
 * recorded.
 */
unsigned tree_sitter_cpp_external_scanner_serialize(void *payload,
                                                    char *buffer) {
    assert(RAW_STRING_DELIMITER_MAX * sizeof(wchar_t) <
               TREE_SITTER_SERIALIZATION_BUFFER_SIZE &&
           "Serialized delimiter is too long!");
    const Scanner *state = (const Scanner *)payload;
    const size_t byte_count = sizeof(wchar_t) * state->delimiter_length;
    memcpy(buffer, state->delimiter, byte_count);
    return (unsigned)byte_count;
}

/**
 * Restore the scanner state from bytes produced by
 * tree_sitter_cpp_external_scanner_serialize.
 *
 * @param payload Scanner state to overwrite.
 * @param buffer  Serialized delimiter characters; not read when `length`
 *                is 0, so it may be NULL in that case.
 * @param length  Number of bytes in `buffer`; expected to be a multiple of
 *                sizeof(wchar_t) and no larger than the delimiter buffer.
 */
void tree_sitter_cpp_external_scanner_deserialize(void *payload,
                                                  const char *buffer,
                                                  unsigned length) {
    assert(length % sizeof(wchar_t) == 0 &&
           "Can't decode serialized delimiter!");

    Scanner *scanner = (Scanner *)payload;

    assert(length <= sizeof(scanner->delimiter) &&
           "Serialized delimiter exceeds scanner capacity!");

    // Empty state: nothing to copy. Passing a NULL buffer to memcpy is
    // undefined behaviour even for a zero length, so skip the call entirely.
    // An oversized payload is also treated as empty, so a build without
    // assertions cannot write past the fixed-size delimiter array.
    if (length == 0 || length > sizeof(scanner->delimiter)) {
        scanner->delimiter_length = 0;
        return;
    }

    scanner->delimiter_length = (uint8_t)(length / sizeof(wchar_t));
    memcpy(scanner->delimiter, buffer, length);
}

/* Release the scanner state passed in as `payload`. */
void tree_sitter_cpp_external_scanner_destroy(void *payload) {
    free(payload);
}
152 changes: 0 additions & 152 deletions src/scanner.cc

This file was deleted.

0 comments on commit 2e29f23

Please sign in to comment.