From 3f5d49819fd75b5ee9d8ef7d9470ec498c0c2947 Mon Sep 17 00:00:00 2001
From: KrosFire <pawel.karas@icloud.com>
Date: Wed, 28 Aug 2024 18:55:29 +0200
Subject: [PATCH] FEAT - Created static Lexer and feature flags

---
 Cargo.toml                           |  11 +
 README.md                            |  10 +
 src/compiling/compiler.rs            |   3 +-
 src/compiling/lexing/lexer.rs        | 174 ++++-----
 src/compiling/lexing/lexer_static.rs | 511 +++++++++++++++++++++++++++
 src/compiling/lexing/mod.rs          |  24 +-
 src/compiling/mod.rs                 |   3 +
 7 files changed, 631 insertions(+), 105 deletions(-)
 create mode 100644 src/compiling/lexing/lexer_static.rs

diff --git a/Cargo.toml b/Cargo.toml
index 3fa20a4..84f604f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,6 +9,17 @@ authors = ["pawel.karas@icloud.com"]
 keywords = ["heraclitus", "compiler", "parser"]
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
+[features]
+default = ["compiler"]
+
+# Include the compiler and the dynamic lexer
+compiler = ["lexer_dynamic"]
+
+# Include the dynamic lexer
+lexer_dynamic = []
+
+# Include the static lexer
+lexer_static = []
 
 [dependencies]
 colored = "2.0.0"

diff --git a/README.md b/README.md
index f97b8e8..2a411b1 100644
--- a/README.md
+++ b/README.md
@@ -43,6 +43,16 @@ let cc = Compiler::new("HerbScript", rules);
 let tokens = cc.tokenize()?;
 ```
 
+## Features
+
+You can enable specific features of Heraclitus. Available options are:
+
+- `compiler` - Includes the Compiler and the dynamic Lexer
+- `lexer_dynamic` - Includes just the dynamic Lexer
+- `lexer_static` - Includes just the static Lexer
+
+The difference between the `static` and the `dynamic` Lexer is that the `static` Lexer doesn't mutate its own state while tokenizing the input.
+
 # Change log 🚀
 
 ## Version 1.7.8

diff --git a/src/compiling/compiler.rs b/src/compiling/compiler.rs
index 43c434e..a690e70 100644
--- a/src/compiling/compiler.rs
+++ b/src/compiling/compiler.rs
@@ -2,11 +2,12 @@ use capitalize::Capitalize;
 use std::fs::File;
 use std::io::prelude::*;
 use crate::compiling_rules::Rules;
-use crate::compiling::{Token, Lexer, LexerError, LexerErrorType, Metadata, SyntaxModule};
+use crate::compiling::{Token, LexerError, LexerErrorType, Metadata, SyntaxModule};
 use crate::compiling::failing::message::Message;
 use crate::compiling::failing::failure::Failure;
 use crate::error_pos;
 
+use super::lexer::Lexer;
 
 /// How do you want to separate expressions?
 ///

diff --git a/src/compiling/lexing/lexer.rs b/src/compiling/lexing/lexer.rs
index e22da83..2fe4a53 100644
--- a/src/compiling/lexing/lexer.rs
+++ b/src/compiling/lexing/lexer.rs
@@ -1,25 +1,18 @@
-use crate::compiling::{ Compiler, Token, SeparatorMode, ScopingMode };
+//! Dynamic Lexer
+//!
+//! This module contains the dynamic lexer that is used to tokenize the source code.
+
 use super::compound_handler::{CompoundHandler, CompoundReaction};
-use super::region_handler::{ RegionHandler, RegionReaction };
 use super::reader::Reader;
+use super::region_handler::{RegionHandler, RegionReaction};
+use super::{LexerError, LexerErrorType};
 use crate::compiling::failing::position_info::PositionInfo;
+use crate::compiling::{Compiler, ScopingMode, SeparatorMode, Token};
 
 // This is just an estimation of token amount
 // inside of a typical 200-lined file.
const AVG_TOKEN_AMOUNT: usize = 1024; -/// Lexer's error type -#[derive(Debug)] -pub enum LexerErrorType { - /// Unspillable region has been spilled - Singleline, - /// Given region left unclosed - Unclosed -} - -/// Type containing full error of lexer -pub type LexerError = (LexerErrorType, PositionInfo); - /// The Lexer /// /// Lexer takes source code in a form of a string and translates it to a list of tokens. @@ -40,7 +33,7 @@ pub struct Lexer<'a> { is_escaped: bool, position: (usize, usize), index: usize, - token_start_index: usize + token_start_index: usize, } impl<'a> Lexer<'a> { @@ -60,7 +53,7 @@ impl<'a> Lexer<'a> { is_escaped: false, position: (0, 0), index: 0, - token_start_index: 0 + token_start_index: 0, } } @@ -79,7 +72,9 @@ impl<'a> Lexer<'a> { }); self.position = (0, 0); String::new() - } else { word } + } else { + word + } } /// Add word that has been completed in previous iteration to the lexem @@ -89,12 +84,13 @@ impl<'a> Lexer<'a> { self.lexem.push(Token { word, pos: self.position, - start: self.token_start_index + start: self.token_start_index, }); self.position = (0, 0); String::new() + } else { + word } - else { word } } /// Add word that has been completed in current iteration to the lexem @@ -104,12 +100,13 @@ impl<'a> Lexer<'a> { self.lexem.push(Token { word, pos: self.position, - start: self.token_start_index + start: self.token_start_index, }); self.position = (0, 0); String::new() + } else { + word } - else { word } } /// Checks whether this is a nontokenizable region @@ -117,8 +114,9 @@ impl<'a> Lexer<'a> { pub fn is_tokenized_region(&self, reaction: &RegionReaction) -> bool { if let Some(region) = self.region.get_region().as_ref() { region.tokenize && *reaction == RegionReaction::Pass + } else { + false } - else { false } } /// Pattern code for adding a symbol @@ -208,7 +206,7 @@ impl<'a> Lexer<'a> { // Normally start a new region word = self.pattern_begin(word, letter); } - }, + } // If the region has been closed // Add the closing region and finish the word RegionReaction::End(tokenize) => { @@ -230,7 +228,11 @@ impl<'a> Lexer<'a> { } } RegionReaction::Pass => { - match self.compound.handle_compound(letter, &self.reader, self.is_tokenized_region(&reaction)) { + match self.compound.handle_compound( + letter, + &self.reader, + self.is_tokenized_region(&reaction), + ) { CompoundReaction::Begin => word = self.pattern_begin(word, letter), CompoundReaction::Keep => self.word_push(&mut word, letter), CompoundReaction::End => word = self.pattern_end(word, letter), @@ -239,7 +241,8 @@ impl<'a> Lexer<'a> { if !self.is_tokenized_region(&reaction) { let region = self.region.get_region().unwrap(); // Flip escaped key - self.is_escaped = (!self.is_escaped && letter == self.escape_symbol) + self.is_escaped = (!self.is_escaped + && letter == self.escape_symbol) .then(|| !self.is_escaped) .unwrap_or(false); // Handle singleline attribute @@ -247,13 +250,12 @@ impl<'a> Lexer<'a> { let pos = self.reader.get_position(); return Err(( LexerErrorType::Singleline, - PositionInfo::at_pos(self.path.clone(), pos, 0).data(region.name.clone()) - )) + PositionInfo::at_pos(self.path.clone(), pos, 0) + .data(region.name.clone()), + )); } self.word_push(&mut word, letter); - } - else { - + } else { /******************/ /* Mode modifiers */ /******************/ @@ -278,14 +280,14 @@ impl<'a> Lexer<'a> { is_indenting = false; } } - continue + continue; } } // Skip newline character if we want to manually insert semicolons if let SeparatorMode::Manual = self.separator_mode { if letter == '\n' 
{ word = self.add_word(word); - continue + continue; } } @@ -316,7 +318,7 @@ impl<'a> Lexer<'a> { if let Err((pos, region)) = self.region.is_region_closed(&self.reader) { return Err(( LexerErrorType::Unclosed, - PositionInfo::at_pos(self.path.clone(), pos, 0).data(region.name) + PositionInfo::at_pos(self.path.clone(), pos, 0).data(region.name), )); } Ok(()) @@ -325,24 +327,22 @@ impl<'a> Lexer<'a> { #[cfg(test)] mod test { - use crate::compiling_rules::{ Region, Rules }; + use crate::compiling::{Compiler, ScopingMode}; + use crate::compiling_rules::{Region, Rules}; use crate::reg; - use crate::compiling::{ Compiler, ScopingMode }; #[test] fn test_lexer_base() { let symbols = vec!['(', ')']; - let regions = reg![ - reg!(string as "String literal" => { - begin: "'", - end: "'" - } => [ - reg!(array as "Array Literal" => { - begin: "[", - end: "]" - }) - ]) - ]; + let regions = reg![reg!(string as "String literal" => { + begin: "'", + end: "'" + } => [ + reg!(array as "Array Literal" => { + begin: "[", + end: "]" + }) + ])]; let expected = vec![ ("let".to_string(), 1, 1), ("a".to_string(), 1, 5), @@ -351,7 +351,7 @@ mod test { ("12".to_string(), 1, 10), ("+".to_string(), 1, 13), ("32".to_string(), 1, 15), - (")".to_string(), 1, 17) + (")".to_string(), 1, 17), ]; let rules = Rules::new(symbols, vec![], regions); let mut cc: Compiler = Compiler::new("TestScript", rules); @@ -370,18 +370,16 @@ mod test { #[test] fn test_lexer_string_interp() { let symbols = vec!['(', ')']; - let regions = reg![ - reg!(string_literal as "String literal" => { - begin: "'", - end: "'" - } => [ - reg!(string_interp as "String interpolation" => { - begin: "{", - end: "}", - tokenize: true - } ref global) - ]) - ]; + let regions = reg![reg!(string_literal as "String literal" => { + begin: "'", + end: "'" + } => [ + reg!(string_interp as "String interpolation" => { + begin: "{", + end: "}", + tokenize: true + } ref global) + ])]; let expected = vec![ ("let".to_string(), 1, 1), ("a".to_string(), 1, 5), @@ -394,7 +392,7 @@ mod test { ("}".to_string(), 1, 30), (" long'".to_string(), 1, 31), ("}".to_string(), 1, 37), - (" 🎉 text'".to_string(), 1, 38) + (" 🎉 text'".to_string(), 1, 38), ]; let rules = Rules::new(symbols, vec![], regions); let mut cc: Compiler = Compiler::new("TestScript", rules); @@ -423,16 +421,12 @@ mod test { ("subcondition".to_string(), (2, 8), 21), (":".to_string(), (2, 20), 33), ("\n ".to_string(), (3, 1), 34), - ("pass".to_string(), (3, 9), 43) + ("pass".to_string(), (3, 9), 43), ]; let rules = Rules::new(symbols, vec![], regions); let mut cc: Compiler = Compiler::new("Testhon", rules); cc.scoping_mode = ScopingMode::Indent; - cc.load(vec![ - "if condition:", - " if subcondition:", - " pass" - ].join("\n")); + cc.load(vec!["if condition:", " if subcondition:", " pass"].join("\n")); let mut lexer = super::Lexer::new(&cc); let mut result = vec![]; // Simulate lexing @@ -455,15 +449,11 @@ mod test { ("12".to_string(), 1, 11), ("+".to_string(), 2, 1), ("12".to_string(), 3, 1), - (";".to_string(), 3, 3) + (";".to_string(), 3, 3), ]; let rules = Rules::new(symbols, vec![], regions); let mut cc: Compiler = Compiler::new("Testhon", rules); - cc.load(vec![ - "let age = 12", - "+", - "12;" - ].join("\n")); + cc.load(vec!["let age = 12", "+", "12;"].join("\n")); let mut lexer = super::Lexer::new(&cc); let mut result = vec![]; // Simulate lexing @@ -478,24 +468,14 @@ mod test { #[test] fn test_lexer_multiline_regions() { let symbols = vec![';', '+', '=']; - let regions = reg![ - reg!(string as "String" => { - 
begin: "'",
-                end: "'"
-            })
-        ];
-        let expected = vec![
-            ("'this\nis\na\nmultiline\nstring'".to_string(), 1, 1)
-        ];
+        let regions = reg![reg!(string as "String" => {
+            begin: "'",
+            end: "'"
+        })];
+        let expected = vec![("'this\nis\na\nmultiline\nstring'".to_string(), 1, 1)];
         let rules = Rules::new(symbols, vec![], regions);
         let mut cc: Compiler = Compiler::new("Test", rules);
-        cc.load(vec![
-            "'this",
-            "is",
-            "a",
-            "multiline",
-            "string'",
-        ].join("\n"));
+        cc.load(vec!["'this", "is", "a", "multiline", "string'"].join("\n"));
         let mut lexer = super::Lexer::new(&cc);
         let mut result = vec![];
         // Simulate lexing
@@ -510,20 +490,14 @@
     #[test]
     fn test_lexer_escaped_regions() {
         let symbols = vec![';', '+', '='];
-        let regions = reg![
-            reg!(string as "String" => {
-                begin: "\"",
-                end: "\""
-            })
-        ];
-        let expected = vec![
-            ("\"this is \\\"escaped\\\" string\"".to_string(), 1, 1)
-        ];
+        let regions = reg![reg!(string as "String" => {
+            begin: "\"",
+            end: "\""
+        })];
+        let expected = vec![("\"this is \\\"escaped\\\" string\"".to_string(), 1, 1)];
         let rules = Rules::new(symbols, vec![], regions);
         let mut cc: Compiler = Compiler::new("Test", rules);
-        cc.load(vec![
-            "\"this is \\\"escaped\\\" string\""
-        ].join("\n"));
+        cc.load(vec!["\"this is \\\"escaped\\\" string\""].join("\n"));
         let mut lexer = super::Lexer::new(&cc);
         let mut result = vec![];
         // Simulate lexing
diff --git a/src/compiling/lexing/lexer_static.rs b/src/compiling/lexing/lexer_static.rs
new file mode 100644
index 0000000..254e6bb
--- /dev/null
+++ b/src/compiling/lexing/lexer_static.rs
@@ -0,0 +1,511 @@
+//! Static Lexer
+//!
+//! This module contains the static lexer that is used to tokenize the source code.
+
+use crate::{
+    compiling_rules::Rules,
+    prelude::{PositionInfo, ScopingMode, SeparatorMode, Token},
+};
+
+use super::{
+    compound_handler::{CompoundHandler, CompoundReaction},
+    reader::Reader,
+    region_handler::{RegionHandler, RegionReaction},
+    LexerError, LexerErrorType,
+};
+
+/// Static Lexer
+pub struct StaticLexer {
+    rules: Rules,
+    /// Path to the lexed file
+    pub path: Option<String>,
+    /// Separator mode for this lexer
+    pub separator_mode: SeparatorMode,
+    /// Escape symbol for this lexer. Default is '\\'
+    pub escape_symbol: char,
+    /// Scoping mode for this lexer
+    pub scoping_mode: ScopingMode,
+}
+
+struct LexState<'a> {
+    word: String,
+    is_indenting: bool,
+    is_escaped: bool,
+    token_start_index: usize,
+    position: (usize, usize),
+    reader: Reader<'a>,
+    lexem: Vec<Token>,
+    region_handler: RegionHandler,
+    compound_handler: CompoundHandler,
+}
+
+impl StaticLexer {
+    /// Create a new static Lexer from the given rules
+    pub fn new(rules: Rules) -> Self {
+        StaticLexer {
+            rules,
+            path: None,
+            separator_mode: SeparatorMode::Manual,
+            escape_symbol: '\\',
+            scoping_mode: ScopingMode::Block,
+        }
+    }
+
+    /// Add indentation to the lexem
+    #[inline]
+    fn add_indent(&self, lex_state: &mut LexState) {
+        if lex_state.word.is_empty() {
+            return;
+        }
+
+        // Getting position by word here would attempt to
+        // subtract with overflow since the new line character
+        // technically belongs to the previous line
+        let (row, _col) = lex_state.reader.get_position();
+        lex_state.lexem.push(Token {
+            word: lex_state.word.clone(),
+            pos: (row, 1),
+            start: lex_state.token_start_index,
+        });
+        lex_state.position = (0, 0);
+        lex_state.word = String::new();
+    }
+
+    /// Add word that has been completed in previous iteration to the lexem
+    #[inline]
+    fn add_word(&self, lex_state: &mut LexState) {
+        if lex_state.word.is_empty() {
+            return;
+        }
+
+        lex_state.lexem.push(Token {
+            word: lex_state.word.clone(),
+            pos: lex_state.position,
+            start: lex_state.token_start_index,
+        });
+        lex_state.position = (0, 0);
+        lex_state.word = String::new();
+    }
+
+    /// Add word that has been completed in current iteration to the lexem
+    #[inline]
+    fn add_word_inclusively(&self, lex_state: &mut LexState) {
+        if lex_state.word.is_empty() {
+            return;
+        }
+
+        lex_state.lexem.push(Token {
+            word: lex_state.word.clone(),
+            pos: lex_state.position,
+            start: lex_state.token_start_index,
+        });
+        lex_state.position = (0, 0);
+        lex_state.word = String::new();
+    }
+
+    /// Checks whether this is a nontokenizable region
+    #[inline]
+    fn is_tokenized_region(&self, reaction: &RegionReaction, lex_state: &mut LexState) -> bool {
+        if let Some(region) = lex_state.region_handler.get_region() {
+            region.tokenize && *reaction == RegionReaction::Pass
+        } else {
+            false
+        }
+    }
+
+    /// Pattern code for adding a symbol
+    /// **[*]**
+    #[inline]
+    fn pattern_add_symbol(&self, lex_state: &mut LexState, letter: char) {
+        self.add_word(lex_state);
+
+        if lex_state.word.is_empty() {
+            lex_state.token_start_index = lex_state.reader.get_index();
+        }
+        self.word_push(lex_state, letter);
+        lex_state.position = lex_state.reader.get_position();
+
+        self.add_word_inclusively(lex_state);
+    }
+
+    /// Pattern code for beginning a new region
+    /// **[**
+    #[inline]
+    fn pattern_begin(&self, lex_state: &mut LexState, letter: char) {
+        self.add_word(lex_state);
+        self.word_push(lex_state, letter);
+    }
+
+    /// Pattern code for ending current region
+    /// **]**
+    #[inline]
+    fn pattern_end(&self, lex_state: &mut LexState, letter: char) {
+        self.word_push(lex_state, letter);
+        self.add_word_inclusively(lex_state);
+    }
+
+    /// Push letter to the word and set token start index
+    fn word_push(&self, lex_state: &mut LexState, letter: char) {
+        if lex_state.word.is_empty() {
+            lex_state.token_start_index = lex_state.reader.get_index();
+        }
+        lex_state.word.push(letter);
+    }
+
+    /// Tokenize source code
+    ///
+    /// Run lexer and tokenize code. The resulting tokens are returned.
+    pub fn tokenize(&self, input: &str) -> Result<Vec<Token>, LexerError> {
+        let code = input.to_string();
+
+        let mut lex_state = LexState {
+            word: String::new(),
+            is_indenting: false,
+            is_escaped: false,
+            token_start_index: 0,
+            position: (0, 0),
+            lexem: Vec::new(),
+            reader: Reader::new(&code),
+            region_handler: RegionHandler::new(&self.rules),
+            compound_handler: CompoundHandler::new(&self.rules),
+        };
+
+        while let Some(letter) = lex_state.reader.next() {
+            /****************/
+            /* Set Position */
+            /****************/
+
+            // If the new position hasn't been set yet, set it
+            if lex_state.position == (0, 0) {
+                // If separator mode is set to Manual and the letter is a separator,
+                // then skip finding a new position
+                if SeparatorMode::Manual != self.separator_mode || letter != '\n' {
+                    let region = lex_state.region_handler.get_region().unwrap();
+                    // If the region is tokenized, then check if the letter is a separator
+                    if !region.tokenize || !vec![' ', '\t'].contains(&letter) {
+                        lex_state.position = lex_state.reader.get_position();
+                    }
+                }
+            }
+
+            // Reaction stores the reaction of the region handler
+            // Have we just opened or closed some region?
+            let reaction = lex_state
+                .region_handler
+                .handle_region(&lex_state.reader, lex_state.is_escaped);
+            match reaction {
+                // If the region has been opened
+                // Finish the part that we have been parsing
+                RegionReaction::Begin(tokenize) => {
+                    // Also if the new region is an interpolation that tokenizes
+                    // the inner content - separate the region from the content
+                    if tokenize {
+                        self.pattern_add_symbol(&mut lex_state, letter);
+                    }
+                    // Regular region case
+                    else {
+                        // This is supposed to prevent overshadowing new line
+                        // character if region rule opens with newline
+                        if letter == '\n' {
+                            // This additionally creates a new token
+                            self.pattern_add_symbol(&mut lex_state, letter);
+                        }
+                        // Normally start a new region
+                        self.pattern_begin(&mut lex_state, letter);
+                    }
+                }
+                // If the region has been closed
+                // Add the closing region and finish the word
+                RegionReaction::End(tokenize) => {
+                    // Also if the new region is an interpolation that tokenizes
+                    // the inner content - separate the region from the content
+                    if tokenize {
+                        self.pattern_add_symbol(&mut lex_state, letter);
+                    }
+                    // Regular region case
+                    else {
+                        // Normally close the region
+                        self.pattern_end(&mut lex_state, letter);
+                        // This is supposed to prevent overshadowing new line
+                        // character if region rule closes with newline
+                        if letter == '\n' {
+                            // This additionally creates a new token
+                            self.pattern_add_symbol(&mut lex_state, letter);
+                        }
+                    }
+                }
+                RegionReaction::Pass => {
+                    let is_tokenized_region = self.is_tokenized_region(&reaction, &mut lex_state);
+                    match lex_state.compound_handler.handle_compound(
+                        letter,
+                        &lex_state.reader,
+                        is_tokenized_region,
+                    ) {
+                        CompoundReaction::Begin => self.pattern_begin(&mut lex_state, letter),
+                        CompoundReaction::Keep => self.word_push(&mut lex_state, letter),
+                        CompoundReaction::End => self.pattern_end(&mut lex_state, letter),
+                        CompoundReaction::Pass => {
+                            // Handle region scope
+                            if !self.is_tokenized_region(&reaction, &mut lex_state) {
+                                let region = lex_state.region_handler.get_region().unwrap();
+                                // Flip escaped key
+                                lex_state.is_escaped = (!lex_state.is_escaped
+                                    && letter == self.escape_symbol)
+                                    .then(|| !lex_state.is_escaped)
+                                    .unwrap_or(false);
+                                // Handle singleline attribute
+                                if letter == '\n' && region.singleline {
+                                    let pos = lex_state.reader.get_position();
+                                    return
Err(( + LexerErrorType::Singleline, + PositionInfo::at_pos(self.path.clone(), pos, 0) + .data(region.name.clone()), + )); + } + self.word_push(&mut lex_state, letter); + } else { + /******************/ + /* Mode modifiers */ + /******************/ + + // Create indent regions: '\n ' + if let ScopingMode::Indent = self.scoping_mode { + // If we are still in the indent region - proceed + if lex_state.is_indenting && vec![' ', '\t'].contains(&letter) { + self.word_push(&mut lex_state, letter); + } + // If it's the new line - start indent region + if letter == '\n' { + lex_state.is_indenting = true; + self.pattern_begin(&mut lex_state, letter); + } + // Check if the current letter + // concludes current indent region + if lex_state.is_indenting { + if let Some(next_char) = lex_state.reader.peek() { + if !vec![' ', '\t'].contains(&next_char) { + self.add_indent(&mut lex_state); + lex_state.is_indenting = false; + } + } + continue; + } + } + // Skip newline character if we want to manually insert semicolons + if let SeparatorMode::Manual = self.separator_mode { + if letter == '\n' { + self.add_word(&mut lex_state); + continue; + } + } + + /*****************/ + /* Regular Lexer */ + /*****************/ + + // Skip whitespace + if vec![' ', '\t'].contains(&letter) { + self.add_word(&mut lex_state); + } + // Handle special symbols + else if self.rules.symbols.contains(&letter) || letter == '\n' { + self.pattern_add_symbol(&mut lex_state, letter); + } + // Handle word + else { + self.word_push(&mut lex_state, letter); + } + } + } + } + } + } + } + self.add_word(&mut lex_state); + // If some region exists that was not closed + if let Err((pos, region)) = lex_state.region_handler.is_region_closed(&lex_state.reader) { + return Err(( + LexerErrorType::Unclosed, + PositionInfo::at_pos(self.path.clone(), pos, 0).data(region.name), + )); + } + + Ok(lex_state.lexem) + } +} + +#[cfg(test)] +mod test { + use crate::compiling::ScopingMode; + use crate::compiling_rules::{Region, Rules}; + use crate::reg; + + #[test] + fn test_lexer_base() { + let symbols = vec!['(', ')']; + let regions = reg![reg!(string as "String literal" => { + begin: "'", + end: "'" + } => [ + reg!(array as "Array Literal" => { + begin: "[", + end: "]" + }) + ])]; + let expected = vec![ + ("let".to_string(), 1, 1), + ("a".to_string(), 1, 5), + ("=".to_string(), 1, 7), + ("(".to_string(), 1, 9), + ("12".to_string(), 1, 10), + ("+".to_string(), 1, 13), + ("32".to_string(), 1, 15), + (")".to_string(), 1, 17), + ]; + let rules = Rules::new(symbols, vec![], regions); + let lexer = super::StaticLexer::new(rules); + let mut result = vec![]; + // Simulate lexing + let res = lexer.tokenize("let a = (12 + 32)"); + assert!(res.is_ok()); + for lex in res.unwrap() { + result.push((lex.word, lex.pos.0, lex.pos.1)); + } + assert_eq!(expected, result); + } + + #[test] + fn test_lexer_string_interp() { + let symbols = vec!['(', ')']; + let regions = reg![reg!(string_literal as "String literal" => { + begin: "'", + end: "'" + } => [ + reg!(string_interp as "String interpolation" => { + begin: "{", + end: "}", + tokenize: true + } ref global) + ])]; + let expected = vec![ + ("let".to_string(), 1, 1), + ("a".to_string(), 1, 5), + ("=".to_string(), 1, 7), + ("'this ".to_string(), 1, 9), + ("{".to_string(), 1, 15), + ("'is ".to_string(), 1, 16), + ("{".to_string(), 1, 20), + ("adjective".to_string(), 1, 21), + ("}".to_string(), 1, 30), + (" long'".to_string(), 1, 31), + ("}".to_string(), 1, 37), + (" 🎉 text'".to_string(), 1, 38), + ]; + let rules = 
Rules::new(symbols, vec![], regions); + + let lexer = super::StaticLexer::new(rules); + let mut result = vec![]; + // Simulate lexing + let res = lexer.tokenize("let a = 'this {'is {adjective} long'} 🎉 text'"); + assert!(res.is_ok()); + for lex in res.unwrap() { + result.push((lex.word, lex.pos.0, lex.pos.1)); + } + assert_eq!(expected, result); + } + + #[test] + fn test_lexer_indent_scoping_mode() { + let symbols = vec![':']; + let regions = reg![]; + let expected = vec![ + ("if".to_string(), (1, 1), 0), + ("condition".to_string(), (1, 4), 3), + (":".to_string(), (1, 13), 12), + ("\n ".to_string(), (2, 1), 13), + ("if".to_string(), (2, 5), 18), + ("subcondition".to_string(), (2, 8), 21), + (":".to_string(), (2, 20), 33), + ("\n ".to_string(), (3, 1), 34), + ("pass".to_string(), (3, 9), 43), + ]; + let rules = Rules::new(symbols, vec![], regions); + + let mut lexer = super::StaticLexer::new(rules); + lexer.scoping_mode = ScopingMode::Indent; + let mut result = vec![]; + // Simulate lexing + let res = lexer + .tokenize(&vec!["if condition:", " if subcondition:", " pass"].join("\n")); + assert!(res.is_ok()); + for lex in res.unwrap() { + result.push((lex.word, (lex.pos.0, lex.pos.1), lex.start)); + } + assert_eq!(expected, result); + } + + #[test] + fn test_lexer_manual_separator_mode() { + let symbols = vec![';', '+', '=']; + let regions = reg![]; + let expected = vec![ + ("let".to_string(), 1, 1), + ("age".to_string(), 1, 5), + ("=".to_string(), 1, 9), + ("12".to_string(), 1, 11), + ("+".to_string(), 2, 1), + ("12".to_string(), 3, 1), + (";".to_string(), 3, 3), + ]; + let rules = Rules::new(symbols, vec![], regions); + let lexer = super::StaticLexer::new(rules); + let mut result = vec![]; + // Simulate lexing + let res = lexer.tokenize(&vec!["let age = 12", "+", "12;"].join("\n")); + assert!(res.is_ok()); + for lex in res.unwrap() { + result.push((lex.word, lex.pos.0, lex.pos.1)); + } + assert_eq!(expected, result); + } + + #[test] + fn test_lexer_multiline_regions() { + let symbols = vec![';', '+', '=']; + let regions = reg![reg!(string as "String" => { + begin: "'", + end: "'" + })]; + let expected = vec![("'this\nis\na\nmultiline\nstring'".to_string(), 1, 1)]; + let rules = Rules::new(symbols, vec![], regions); + let lexer = super::StaticLexer::new(rules); + let mut result = vec![]; + // Simulate lexing + let res = lexer.tokenize(&vec!["'this", "is", "a", "multiline", "string'"].join("\n")); + assert!(res.is_ok()); + for lex in res.unwrap() { + result.push((lex.word, lex.pos.0, lex.pos.1)); + } + assert_eq!(expected, result); + } + + #[test] + fn test_lexer_escaped_regions() { + let symbols = vec![';', '+', '=']; + let regions = reg![reg!(string as "String" => { + begin: "\"", + end: "\"" + })]; + let expected = vec![("\"this is \\\"escaped\\\" string\"".to_string(), 1, 1)]; + let rules = Rules::new(symbols, vec![], regions); + let lexer = super::StaticLexer::new(rules); + let mut result = vec![]; + // Simulate lexing + let res = lexer.tokenize(&vec!["\"this is \\\"escaped\\\" string\""].join("\n")); + assert!(res.is_ok()); + for lex in res.unwrap() { + result.push((lex.word, lex.pos.0, lex.pos.1)); + } + assert_eq!(expected, result); + } +} diff --git a/src/compiling/lexing/mod.rs b/src/compiling/lexing/mod.rs index 7257ebc..b53f82e 100644 --- a/src/compiling/lexing/mod.rs +++ b/src/compiling/lexing/mod.rs @@ -1,9 +1,25 @@ //! Lexer module -//! +//! //! 
This module holds all the lexer related modules +use crate::prelude::PositionInfo; + mod compound_handler; -mod region_handler; +#[cfg(feature = "lexer_dynamic")] +pub mod lexer; +#[cfg(feature = "lexer_static")] +pub mod lexer_static; mod reader; -mod lexer; -pub use lexer::*; \ No newline at end of file +mod region_handler; + +/// Lexer's error type +#[derive(Debug)] +pub enum LexerErrorType { + /// Unspillable region has been spilled + Singleline, + /// Given region left unclosed + Unclosed, +} + +/// Type containing full error of lexer +pub type LexerError = (LexerErrorType, PositionInfo); diff --git a/src/compiling/mod.rs b/src/compiling/mod.rs index a305a2f..534a476 100644 --- a/src/compiling/mod.rs +++ b/src/compiling/mod.rs @@ -4,12 +4,15 @@ //! that helps you tokenize your code or even parse it entirely. mod lexing; + +#[cfg(feature = "compiler")] mod compiler; mod token; mod parser; pub mod failing; pub use lexing::*; +#[cfg(feature = "compiler")] pub use compiler::*; pub use token::*; pub use parser::*;
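
Below is a minimal sketch of how the new feature flags and `StaticLexer` would be consumed downstream. The rule set mirrors `test_lexer_base` above; the crate name `heraclitus-compiler` and the root/`prelude` re-exports of `Rules`, `StaticLexer`, `Token` and the `reg!` macro are assumptions for illustration, not something this patch pins down:

```rust
// A downstream Cargo.toml opting into only the static lexer
// (crate name assumed):
//
//     [dependencies]
//     heraclitus-compiler = { version = "*", default-features = false, features = ["lexer_static"] }

use heraclitus_compiler::prelude::*;
use heraclitus_compiler::reg;

fn main() {
    // Same rule set as `test_lexer_base` above
    let symbols = vec!['(', ')'];
    let regions = reg![
        reg!(string as "String literal" => {
            begin: "'",
            end: "'"
        })
    ];
    let rules = Rules::new(symbols, vec![], regions);

    // `StaticLexer::tokenize` takes `&self`, so one instance can be reused
    // across many inputs (or shared between threads) without a reset step.
    let lexer = StaticLexer::new(rules);
    match lexer.tokenize("let a = (12 + 32)") {
        Ok(tokens) => {
            for token in tokens {
                println!("{} at {:?}", token.word, token.pos);
            }
        }
        Err((_error_type, _position)) => eprintln!("lexing failed"),
    }
}
```

Because `tokenize` borrows the lexer immutably and returns the tokens instead of storing them, the `lexer_static` feature suits callers that lex many inputs against one rule set, which is the motivation for splitting the feature flags.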