From 7303b6c32f8d1814cfec0266f5b62b4c5916d329 Mon Sep 17 00:00:00 2001 From: Alex Pinkus Date: Sat, 3 Sep 2022 14:56:38 -0700 Subject: [PATCH 1/2] Move custom operator handling to `scanner.c` Custom operator rules are better expressed as conditions to track in `eat_operator` vs a series of impenetrable regexes. Also requires a fix for an undiscovered bug where `x[...]!` was not considered to be a legal target for an expression. Since the old logic allowed `+=` to be a "custom operator", we were covering up for that failure by interpreting `x[...]! += y` as a custom infix expression. Fixes #5 --- corpus/expressions.txt | 21 +++++ corpus/functions.txt | 6 +- grammar.js | 44 +++------- src/scanner.c | 192 ++++++++++++++++++++++++++++++++++++++--- 4 files changed, 220 insertions(+), 43 deletions(-) diff --git a/corpus/expressions.txt b/corpus/expressions.txt index e4b7fa6..c203326 100755 --- a/corpus/expressions.txt +++ b/corpus/expressions.txt @@ -1142,3 +1142,24 @@ async(async: async, qos: qos, flags: flags) { (simple_identifier) (call_suffix (value_arguments)))))))) + +================================================================================ +Assigning to the result of a force unwrap +================================================================================ + +stat[lang]! += 1 + +-------------------------------------------------------------------------------- + +(source_file + (assignment + (directly_assignable_expression + (postfix_expression + (call_expression + (simple_identifier) + (call_suffix + (value_arguments + (value_argument + (simple_identifier))))) + (bang))) + (integer_literal))) diff --git a/corpus/functions.txt b/corpus/functions.txt index 077601a..6d327fb 100755 --- a/corpus/functions.txt +++ b/corpus/functions.txt @@ -354,6 +354,8 @@ precedencegroup MyPrecedence { infix operator -=- : MyPrecedence +infix operator • + -------------------------------------------------------------------------------- (source_file @@ -371,7 +373,9 @@ infix operator -=- : MyPrecedence (simple_identifier)))) (operator_declaration (custom_operator) - (simple_identifier))) + (simple_identifier)) + (operator_declaration + (custom_operator))) ================================================================================ Custom operator with another operator as a prefix diff --git a/grammar.js b/grammar.js index cc0d12d..242a447 100644 --- a/grammar.js +++ b/grammar.js @@ -79,22 +79,6 @@ if (tree_sitter_version_supports_emoji()) { LEXICAL_IDENTIFIER = /[_\p{XID_Start}][_\p{XID_Continue}]*/; } -const CUSTOM_OPERATORS = token( - choice( - // https://docs.swift.org/swift-book/ReferenceManual/LexicalStructure.html#ID418 - // This supports a subset of the operators that Swift does but I'm really not concerned about the esoteric ones. - // Someone who wants unicode support can add it. What this does do is: - // * Avoid the reserved operators by saying that certain characters are only available if you don't start with them. - // * Entirely forbid `<` as the last char because it creates ambiguity with type arguments - /[\\<>&?=][\/=\-+!*%<>&|^?~\.]*[\/=\-+!*%>&|^?~]+/, - /[\-+!*%|^~]+[\/=\-+!*%<>&|^?~]*[\/=\-+!*%>&|^?~]+/, - /[\-+!*%|^~\.]+[\/=\-+!*%<>&|^?~\.]*[\/=\-+!*%>&|^?~\.]+/, - /[\/]+[=\-+!*%<>&|^?~]*[=\-+!*%>&|^?~]+/, - /[\/]+[=\-+!*%<>&|^?~\.]*[=\-+!*%>&|^?~\.]+/ - ) -); -// XXX need custom scanner for: -// * Custom operators and `<` for type arguments module.exports = grammar({ name: "swift", conflicts: ($) => [ @@ -115,12 +99,10 @@ module.exports = grammar({ // After a `{` in a function or switch context, it's ambigous whether we're starting a set of local statements or // applying some modifiers to a capture or pattern. [$.modifiers], - // Custom operators get weird special handling for `<` characters in silly stuff like `func =<<<(...)` - [$.custom_operator], - [$._prefix_unary_operator, $._referenceable_operator], // `+(...)` is ambigously either "call the function produced by a reference to the operator `+`" or "use the unary // operator `+` on the result of the parenthetical expression." [$._additive_operator, $._prefix_unary_operator], + [$._referenceable_operator, $._prefix_unary_operator], // `{ [self, b, c] ...` could be a capture list or an array literal depending on what else happens. [$.capture_list_item, $.self_expression], [$.capture_list_item, $._expression], @@ -249,6 +231,7 @@ module.exports = grammar({ $._as_quest_custom, $._as_bang_custom, $._async_keyword_custom, + $._custom_operator, ], inline: ($) => [$._locally_permitted_modifiers], rules: { @@ -658,7 +641,7 @@ module.exports = grammar({ field("rhs", $._expr_hack_at_ternary_binary_suffix) ) ), - custom_operator: ($) => seq(CUSTOM_OPERATORS, optional("<")), + custom_operator: ($) => choice(token(/[\/]+[*]+/), $._custom_operator), // Suffixes navigation_suffix: ($) => seq( @@ -1066,7 +1049,8 @@ module.exports = grammar({ $.navigation_expression, $.call_expression, $.tuple_expression, - $.self_expression + $.self_expression, + $.postfix_expression // Since `x[...]! = y` is legal ), //////////////////////////////// // Statements - https://docs.swift.org/swift-book/ReferenceManual/Statements.html @@ -1423,14 +1407,7 @@ module.exports = grammar({ _non_constructor_function_decl: ($) => seq( "func", - field( - "name", - choice( - $.simple_identifier, - $._referenceable_operator, - $._bitwise_binary_operator - ) - ) + field("name", choice($.simple_identifier, $._referenceable_operator)) ), _referenceable_operator: ($) => choice( @@ -1440,10 +1417,15 @@ module.exports = grammar({ $._multiplicative_operator, $._equality_operator, $._comparison_operator, + $._assignment_and_operator, "++", "--", $.bang, - "~" + "~", + "|", + "^", + "<<", + ">>" ), // Hide the fact that certain symbols come from the custom scanner by aliasing them to their // string variants. This keeps us from having to see them in the syntax tree (which would be @@ -1587,7 +1569,7 @@ module.exports = grammar({ seq( choice("prefix", "infix", "postfix"), "operator", - $.custom_operator, + $._referenceable_operator, optional(seq(":", $.simple_identifier)), optional($.deprecated_operator_declaration_body) ), diff --git a/src/scanner.c b/src/scanner.c index 76dac2c..745a43d 100644 --- a/src/scanner.c +++ b/src/scanner.c @@ -29,7 +29,8 @@ enum TokenType { AS_KEYWORD, AS_QUEST, AS_BANG, - ASYNC_KEYWORD + ASYNC_KEYWORD, + CUSTOM_OPERATOR, }; #define OPERATOR_COUNT 22 @@ -116,6 +117,39 @@ const enum TokenType OP_SYMBOLS[OPERATOR_COUNT] = { ASYNC_KEYWORD }; +#define RESERVED_OP_COUNT 28 + +const char* RESERVED_OPS[RESERVED_OP_COUNT] = { + "/", + "=", + "-", + "+", + "!", + "*", + "%", + "<", + ">", + "&", + "|", + "^", + "?", + "~", + ".", + "->", + "/*", + "*/", + "+=", + "-=", + "*=", + "/=", + "%=", + ">>", + "<<", + "++", + "--", + "===" +}; + bool is_cross_semi_token(enum TokenType op) { switch(op) { case ARROW_OPERATOR: @@ -232,6 +266,84 @@ static int32_t encountered_op_count(bool *encountered_operator) { return encountered; } +static bool any_reserved_ops(int8_t *encountered_reserved_ops) { + for (int op_idx = 0; op_idx < RESERVED_OP_COUNT; op_idx++) { + if (encountered_reserved_ops[op_idx] == 2) { + return true; + } + } + + return false; +} + +static bool is_legal_custom_operator( + bool is_first_char, + int32_t first_char, + int32_t cur_char +) { + switch (cur_char) { + case '=': + case '-': + case '+': + case '!': + case '%': + case '<': + case '>': + case '&': + case '|': + case '^': + case '?': + case '~': + return true; + case '.': + // Grammar allows `.` for any operator that starts with `.` + return is_first_char || first_char == '.'; + case '*': + case '/': + // Not listed in the grammar, but `/*` and `//` can't be the start of an operator since they start comments + return is_first_char || first_char != '/'; + default: + if ( + (cur_char >= 0x00A1 && cur_char <= 0x00A7) || + (cur_char == 0x00A9) || + (cur_char == 0x00AB) || + (cur_char == 0x00AC) || + (cur_char == 0x00AE) || + (cur_char >= 0x00B0 && cur_char <= 0x00B1) || + (cur_char == 0x00B6) || + (cur_char == 0x00BB) || + (cur_char == 0x00BF) || + (cur_char == 0x00D7) || + (cur_char == 0x00F7) || + (cur_char >= 0x2016 && cur_char <= 0x2017) || + (cur_char >= 0x2020 && cur_char <= 0x2027) || + (cur_char >= 0x2030 && cur_char <= 0x203E) || + (cur_char >= 0x2041 && cur_char <= 0x2053) || + (cur_char >= 0x2055 && cur_char <= 0x205E) || + (cur_char >= 0x2190 && cur_char <= 0x23FF) || + (cur_char >= 0x2500 && cur_char <= 0x2775) || + (cur_char >= 0x2794 && cur_char <= 0x2BFF) || + (cur_char >= 0x2E00 && cur_char <= 0x2E7F) || + (cur_char >= 0x3001 && cur_char <= 0x3003) || + (cur_char >= 0x3008 && cur_char <= 0x3020) || + (cur_char == 0x3030) + ) { + return true; + } else if ( + (cur_char >= 0x0300 && cur_char <= 0x036f) || + (cur_char >= 0x1DC0 && cur_char <= 0x1DFF) || + (cur_char >= 0x20D0 && cur_char <= 0x20FF) || + (cur_char >= 0xFE00 && cur_char <= 0xFE0F) || + (cur_char >= 0xFE20 && cur_char <= 0xFE2F) || + (cur_char >= 0xE0100 && cur_char <= 0xE01EF) + ) { + return !is_first_char; + } else { + return false; + } + } +} + static bool eat_operators( TSLexer *lexer, const bool *valid_symbols, @@ -239,9 +351,17 @@ static bool eat_operators( enum TokenType *symbol_result ) { bool possible_operators[OPERATOR_COUNT]; + uint8_t reserved_operators[RESERVED_OP_COUNT]; for (int op_idx = 0; op_idx < OPERATOR_COUNT; op_idx++) { possible_operators[op_idx] = valid_symbols[OP_SYMBOLS[op_idx]]; } + for (int op_idx = 0; op_idx < RESERVED_OP_COUNT; op_idx++) { + reserved_operators[op_idx] = 1; + } + + bool possible_custom_operator = valid_symbols[CUSTOM_OPERATOR]; + int32_t first_char = lexer->lookahead; + int32_t last_examined_char = first_char; int32_t str_idx = 0; int32_t full_match = -1; @@ -305,12 +425,53 @@ static bool eat_operators( } } - if (encountered_op_count(possible_operators) == 0) { - break; + for (int op_idx = 0; op_idx < RESERVED_OP_COUNT; op_idx++) { + if (!reserved_operators[op_idx]) { + continue; + } + + if (RESERVED_OPS[op_idx][str_idx] == '\0') { + reserved_operators[op_idx] = 0; + continue; + } + + if (RESERVED_OPS[op_idx][str_idx] != lexer->lookahead) { + reserved_operators[op_idx] = 0; + continue; + } + + if (RESERVED_OPS[op_idx][str_idx + 1] == '\0') { + reserved_operators[op_idx] = 2; + continue; + } } + possible_custom_operator = possible_custom_operator && is_legal_custom_operator( + str_idx == 0, + first_char, + lexer->lookahead + ); + + uint32_t encountered_ops = encountered_op_count(possible_operators); + if (encountered_ops == 0) { + if (!possible_custom_operator) { + break; + } else if (mark_end && full_match == -1) { + lexer->mark_end(lexer); + } + } + + last_examined_char = lexer->lookahead; lexer->advance(lexer, false); str_idx += 1; + + if (encountered_ops == 0 && !is_legal_custom_operator( + str_idx == 0, + first_char, + lexer->lookahead + )) { + break; + } } if (full_match != -1) { @@ -318,6 +479,14 @@ static bool eat_operators( return true; } + if (possible_custom_operator && !any_reserved_ops(reserved_operators)) { + if ((last_examined_char != '<' || iswspace(lexer->lookahead)) && mark_end) { + lexer->mark_end(lexer); + } + *symbol_result = CUSTOM_OPERATOR; + return true; + } + return false; } @@ -580,6 +749,15 @@ bool tree_sitter_swift_external_scanner_scan( bool has_ws_result = (ws_directive != CONTINUE_PARSING_NOTHING_FOUND); + // Now consume comments (before custom operators so that those aren't treated as comments) + enum TokenType comment_result; + bool saw_comment = eat_comment(lexer, valid_symbols, /* mark_end */ true, &comment_result); + if (saw_comment) { + lexer->mark_end(lexer); + lexer->result_symbol = comment_result; + return true; + } + // Now consume any operators that might cause our whitespace to be suppressed. enum TokenType operator_result; bool saw_operator = eat_operators( @@ -600,14 +778,6 @@ bool tree_sitter_swift_external_scanner_scan( return true; } - enum TokenType comment_result; - bool saw_comment = eat_comment(lexer, valid_symbols, /* mark_end */ true, &comment_result); - if (saw_comment) { - lexer->mark_end(lexer); - lexer->result_symbol = comment_result; - return true; - } - // NOTE: this will consume any `#` characters it sees, even if it does not find a result. Keep // it at the end so that it doesn't interfere with special literals or selectors! enum TokenType raw_str_result; From 8752db87adfbe8d430c7eb5cb4cbbced6c573a33 Mon Sep 17 00:00:00 2001 From: Alex Pinkus Date: Sat, 3 Sep 2022 19:31:54 -0700 Subject: [PATCH 2/2] Add more repositories to top-repo tracking --- .github/workflows/top-repos.yml | 10 ++++++++++ grammar.js | 2 +- script-data/known_failures.txt | 2 +- script-data/top-repositories.txt | 10 ++++++++++ 4 files changed, 22 insertions(+), 2 deletions(-) diff --git a/.github/workflows/top-repos.yml b/.github/workflows/top-repos.yml index 1c8edcc..af0270c 100644 --- a/.github/workflows/top-repos.yml +++ b/.github/workflows/top-repos.yml @@ -55,6 +55,16 @@ jobs: - 41 - 42 - 43 + - 44 + - 45 + - 46 + - 47 + - 48 + - 49 + - 50 + - 51 + - 52 + - 53 steps: - uses: actions/checkout@v2 - name: Use Node.js ${{ matrix.node-version }} diff --git a/grammar.js b/grammar.js index 242a447..ae8885a 100644 --- a/grammar.js +++ b/grammar.js @@ -853,7 +853,7 @@ module.exports = grammar({ prec.left( PRECS.lambda, seq( - "{", + choice("{", "^{"), optional($._lambda_type_declaration), optional($.statements), "}" diff --git a/script-data/known_failures.txt b/script-data/known_failures.txt index f4a371f..34d24cc 100644 --- a/script-data/known_failures.txt +++ b/script-data/known_failures.txt @@ -1 +1 @@ -firefox-ios/Shared/Functions.swift +ReactKit/ReactKitTests/OperationTests.swift diff --git a/script-data/top-repositories.txt b/script-data/top-repositories.txt index bf957d2..321d195 100644 --- a/script-data/top-repositories.txt +++ b/script-data/top-repositories.txt @@ -41,3 +41,13 @@ Nuke kean/Nuke 11.1.1 Swinject Swinject/Swinject 2.8.2 GRDB groue/GRDB.swift v6.0.0-beta.3 0 2 GRDB groue/GRDB.swift v6.0.0-beta.3 1 2 +Dance saoudrizwan/Dance v1.0.7 +StyleKit 146BC/StyleKit 0.7.0 +ReactKit ReactKit/ReactKit 0.12.0 +JASON delba/JASON 3.1.1 +Side-Menu Yalantis/Side-Menu.iOS 2.0.2 +C4iOS C4Labs/C4iOS 3.0.1 +Taylor izqui/Taylor 0.4.5 +Runes thoughtbot/Runes v5.1.0 +Overdrive saidsikira/Overdrive 0.3 +Tactile delba/Tactile 3.0.1