line:col positions in parser

enso-org · Nov 1, 2023 · 5acbcd2 · 5acbcd2
1 parent 8bc17bd
commit 5acbcd2
Show file tree

Hide file tree

Showing 13 changed files with 446 additions and 257 deletions.
diff --git a/app/gui2/parser-codegen/util.ts b/app/gui2/parser-codegen/util.ts
@@ -21,12 +21,12 @@ const RENAME = new Map([
   // Rename source references to reflect our usage:
   // - In `Tree`s:
   ['spanLeftOffsetCodeOffsetUtf16', 'whitespaceStartInCodeParsed'],
-  ['spanLeftOffsetCodeUtf16', 'whitespaceLengthInCodeParsed'],
+  ['spanLeftOffsetCodeLenUtf16', 'whitespaceLengthInCodeParsed'],
   ['spanCodeLengthUtf16', 'childrenLengthInCodeParsed'],
   // - In `Token`s:
   ['leftOffsetCodeOffsetUtf16', 'whitespaceStartInCodeBuffer'],
-  ['leftOffsetCodeUtf16', 'whitespaceLengthInCodeBuffer'],
-  ['codeUtf16', 'lengthInCodeBuffer'],
+  ['leftOffsetCodeLenUtf16', 'whitespaceLengthInCodeBuffer'],
+  ['codeLenUtf16', 'lengthInCodeBuffer'],
   ['codeOffsetUtf16', 'startInCodeBuffer'],
 ])
 

diff --git a/lib/rust/parser/debug/src/bin/lexer.rs b/lib/rust/parser/debug/src/bin/lexer.rs
@@ -29,5 +29,5 @@ pub fn main() {
     use std::io::Read;
     let mut input = String::new();
     std::io::stdin().read_to_string(&mut input).unwrap();
-    println!("{:#?}", enso_parser::lexer::run(&input));
+    println!("{:#?}", enso_parser::lexer::debug::lex_and_validate_spans(&input));
 }
diff --git a/lib/rust/parser/debug/src/lib.rs b/lib/rust/parser/debug/src/lib.rs
@@ -23,12 +23,12 @@
 #![warn(unused_qualifications)]
 
 use enso_metamodel_lexpr::ToSExpr;
+use enso_parser::source::code::debug::LocationCheck;
 use enso_reflect::Reflect;
 use lexpr::Value;
 use std::collections::HashSet;
 
 
-
 // =====================
 // === S-expressions ===
 // =====================
@@ -122,10 +122,18 @@ fn strip_hidden_fields(tree: Value) -> Value {
         ":spanLeftOffsetVisible",
         ":spanLeftOffsetCodeReprBegin",
         ":spanLeftOffsetCodeReprLen",
-        ":spanLeftOffsetCodeUtf16",
+        ":spanLeftOffsetCodeLenUtf8",
+        ":spanLeftOffsetCodeLenUtf16",
+        ":spanLeftOffsetCodeLenNewlines",
+        ":spanLeftOffsetCodeLenLineChars",
+        ":spanLeftOffsetCodeOffsetUtf8",
         ":spanLeftOffsetCodeOffsetUtf16",
+        ":spanLeftOffsetCodeOffsetLine",
+        ":spanLeftOffsetCodeOffsetCol",
         ":spanCodeLengthUtf8",
         ":spanCodeLengthUtf16",
+        ":spanCodeLengthNewlines",
+        ":spanCodeLengthLineChars",
     ];
     let hidden_tree_fields: HashSet<_> = hidden_tree_fields.into_iter().collect();
     Value::list(tree.to_vec().unwrap().into_iter().filter(|val| match val {
@@ -194,7 +202,11 @@ fn tuplify(value: Value) -> Value {
 
 /// Check the internal consistency of the `Tree` and `Token` spans from the given root, and validate
 /// that every character in the given range is covered exactly once in the token spans.
-pub fn validate_spans(tree: &enso_parser::syntax::tree::Tree, expected_span: std::ops::Range<u32>) {
+pub fn validate_spans(
+    tree: &enso_parser::syntax::tree::Tree,
+    expected_span: std::ops::Range<u32>,
+    locations: &mut LocationCheck,
+) {
     let mut sum_span = None;
     fn concat<T: PartialEq + std::fmt::Debug + Copy>(
         a: &Option<std::ops::Range<T>>,
@@ -208,24 +220,33 @@ pub fn validate_spans(tree: &enso_parser::syntax::tree::Tree, expected_span: std
             None => b.clone(),
         }
     }
-    sum_span = Some(concat(&sum_span, &tree.span.left_offset.code.range_utf16()));
+    sum_span = Some(concat(&sum_span, &tree.span.left_offset.code.range()));
     tree.visit_items(|item| match item {
         enso_parser::syntax::item::Ref::Token(token) => {
             if !(token.left_offset.is_empty() && token.code.is_empty()) {
-                sum_span = Some(concat(&sum_span, &token.left_offset.code.range_utf16()));
-                sum_span = Some(concat(&sum_span, &token.code.range_utf16()));
+                sum_span = Some(concat(&sum_span, &token.left_offset.code.range()));
+                sum_span = Some(concat(&sum_span, &token.code.range()));
             }
+            let left_offset = token.left_offset.code.range();
+            let code = token.code.range();
+            locations.extend(&[left_offset.start, left_offset.end, code.start, code.end]);
         }
         enso_parser::syntax::item::Ref::Tree(tree) => {
             let children_span =
-                concat(&Some(tree.span.left_offset.code.range_utf16()), &tree.span.range_utf16());
-            validate_spans(tree, children_span.clone());
+                concat(&Some(tree.span.left_offset.code.range()), &tree.span.range());
+            let children_span_ = children_span.start.utf16..children_span.end.utf16;
+            validate_spans(tree, children_span_, locations);
             sum_span = Some(concat(&sum_span, &children_span));
+            let left_offset = tree.span.left_offset.code.range();
+            let code = tree.span.range();
+            locations.extend(&[left_offset.start, left_offset.end, code.start, code.end]);
         }
     });
     if expected_span.is_empty() {
         assert!(sum_span.map_or(true, |range| range.is_empty()));
     } else {
-        assert_eq!(sum_span.unwrap(), expected_span);
+        let sum_span = sum_span.unwrap_or_default();
+        let sum_span = sum_span.start.utf16..sum_span.end.utf16;
+        assert_eq!(sum_span, expected_span);
     }
 }
diff --git a/lib/rust/parser/debug/src/main.rs b/lib/rust/parser/debug/src/main.rs
@@ -41,10 +41,12 @@ fn check_file(path: &str, mut code: &str) {
     }
     let ast = enso_parser::Parser::new().run(code);
     let expected_span = 0..(code.encode_utf16().count() as u32);
-    enso_parser_debug::validate_spans(&ast, expected_span);
+    let mut locations = enso_parser::source::code::debug::LocationCheck::new();
+    enso_parser_debug::validate_spans(&ast, expected_span, &mut locations);
     for (parsed, original) in ast.code().lines().zip(code.lines()) {
         assert_eq!(parsed, original, "Bug: dropped tokens, while parsing: {path}");
     }
+    locations.check(code);
     let s_expr = enso_parser_debug::to_s_expr(&ast, code);
     println!("{s_expr}");
 }
diff --git a/lib/rust/parser/debug/tests/parse.rs b/lib/rust/parser/debug/tests/parse.rs
@@ -482,6 +482,13 @@ fn dot_operator_blocks() {
 
 #[test]
 fn code_block_argument_list() {
+    #[rustfmt::skip]
+    let code = [
+        "foo",
+        "    bar",
+    ];
+    test!(&code.join("\n"), (ArgumentBlockApplication (Ident foo) #((Ident bar))));
+
     #[rustfmt::skip]
     let code = [
         "value = foo",
@@ -492,7 +499,6 @@ fn code_block_argument_list() {
     ];
     test(&code.join("\n"), expect);
 
-
     #[rustfmt::skip]
     let code = [
         "value = foo",
@@ -1012,28 +1018,19 @@ x"#;
 
 #[test]
 fn interpolated_literals_in_inline_text() {
-    #[rustfmt::skip]
-    let cases = [
-        (r#"'Simple case.'"#, block![(TextLiteral #((Section "Simple case.")))]),
-        (r#"'With a `splice`.'"#, block![(TextLiteral
-            #((Section "With a ")
-              (Splice (Ident splice))
-              (Section ".")))]),
-        (r#"'` SpliceWithLeadingWhitespace`'"#, block![(TextLiteral
-            #((Splice (Ident SpliceWithLeadingWhitespace))))]),
-        (r#"'String with \n escape'"#, block![
-            (TextLiteral
-             #((Section "String with ") (Escape '\n') (Section " escape")))]),
-        (r#"'\x0Aescape'"#, block![
-            (TextLiteral #((Escape '\n') (Section "escape")))]),
-        (r#"'\u000Aescape'"#, block![
-            (TextLiteral #((Escape '\n') (Section "escape")))]),
-        (r#"'\u{0000A}escape'"#, block![
-            (TextLiteral #((Escape '\n') (Section "escape")))]),
-        (r#"'\U0000000Aescape'"#, block![
-            (TextLiteral #((Escape '\n') (Section "escape")))]),
-    ];
-    cases.into_iter().for_each(|(code, expected)| test(code, expected));
+    test!(r#"'Simple case.'"#, (TextLiteral #((Section "Simple case."))));
+    test!(r#"'With a `splice`.'"#, (TextLiteral
+        #((Section "With a ")
+          (Splice (Ident splice))
+          (Section "."))));
+    test!(r#"'` SpliceWithLeadingWhitespace`'"#,
+        (TextLiteral #((Splice (Ident SpliceWithLeadingWhitespace)))));
+    test!(r#"'String with \n escape'"#,
+        (TextLiteral #((Section "String with ") (Escape '\n') (Section " escape"))));
+    test!(r#"'\x0Aescape'"#, (TextLiteral #((Escape '\n') (Section "escape"))));
+    test!(r#"'\u000Aescape'"#, (TextLiteral #((Escape '\n') (Section "escape"))));
+    test!(r#"'\u{0000A}escape'"#, (TextLiteral #((Escape '\n') (Section "escape"))));
+    test!(r#"'\U0000000Aescape'"#, (TextLiteral #((Escape '\n') (Section "escape"))));
 }
 
 #[test]
@@ -1580,7 +1577,9 @@ fn test(code: &str, expect: lexpr::Value) {
 fn parse(code: &str) -> enso_parser::syntax::tree::Tree {
     let ast = enso_parser::Parser::new().run(code);
     let expected_span = 0..(code.encode_utf16().count() as u32);
-    enso_parser_debug::validate_spans(&ast, expected_span);
+    let mut locations = enso_parser::source::code::debug::LocationCheck::new();
+    enso_parser_debug::validate_spans(&ast, expected_span, &mut locations);
+    locations.check(code);
     ast
 }
 

diff --git a/lib/rust/parser/debug/tools/parse_all_enso_files.sh b/lib/rust/parser/debug/tools/parse_all_enso_files.sh
@@ -15,7 +15,12 @@
 set -e
 
 cargo build -p enso-parser-debug --bin enso-parser-debug
+cargo build -p enso-parser-debug --bin lexer
 
 ENSO_FILES=$(find distribution/ test/ -name '*.enso' -print | sort)
-for x in $ENSO_FILES; do echo -n "$x "; target/rust/debug/enso-parser-debug <$x; done
+for x in $ENSO_FILES; do
+	echo -n "$x "
+	target/rust/debug/lexer <$x >/dev/null
+	target/rust/debug/enso-parser-debug <$x
+done
 
diff --git a/lib/rust/parser/generate-java/src/serialization.rs b/lib/rust/parser/generate-java/src/serialization.rs
@@ -19,6 +19,9 @@ const CODE_GETTER: &str = "codeRepr";
 const WHITESPACE_GETTER: &str = "getWhitespace";
 const TREE_BEGIN: &str = "fieldSpanLeftOffsetCodeReprBegin";
 const TREE_LEN: &str = "fieldSpanLeftOffsetCodeReprLen";
+const TREE_WHITESPACE: &str = "fieldSpanLeftOffsetCodeLenUtf16";
+const TOKEN_WHITESPACE: &str = "fieldLeftOffsetCodeLenUtf16";
+const TOKEN_CODE_LENGTH: &str = "fieldCodeLenUtf16";
 
 /// Derive deserialization for all types in the typegraph.
 pub fn derive(graph: &mut TypeGraph, tree: ClassId, token: ClassId) {
@@ -151,16 +154,16 @@ fn start_whitespace() -> impl for<'a> Fn(MaterializerInput<'a>) -> String + 'sta
     |MaterializerInput { message }| format!("{message}.position()")
 }
 fn start_code_tree() -> impl for<'a> Fn(MaterializerInput<'a>) -> String + 'static {
-    |MaterializerInput { message }| format!("{message}.advance(fieldSpanLeftOffsetCodeUtf16)")
+    |MaterializerInput { message }| format!("{message}.advance({TREE_WHITESPACE})")
 }
 fn end_code_tree() -> impl for<'a> Fn(MaterializerInput<'a>) -> String + 'static {
     |MaterializerInput { message }| format!("{message}.position()")
 }
 fn start_code_token() -> impl for<'a> Fn(MaterializerInput<'a>) -> String + 'static {
-    |MaterializerInput { message }| format!("{message}.advance(fieldLeftOffsetCodeUtf16)")
+    |MaterializerInput { message }| format!("{message}.advance({TOKEN_WHITESPACE})")
 }
 fn end_code_token() -> impl for<'a> Fn(MaterializerInput<'a>) -> String + 'static {
-    |MaterializerInput { message }| format!("{message}.advance(fieldCodeUtf16)")
+    |MaterializerInput { message }| format!("{message}.advance({TOKEN_CODE_LENGTH})")
 }